From e2e7c482e805cd4b8ac07fd1929bc2f9ee20e9a1 Mon Sep 17 00:00:00 2001
From: github-action-benchmark <github@users.noreply.github.com>
Date: Fri, 7 Jun 2024 07:48:48 +0000
Subject: [PATCH] add smaller_is_better (customSmallerIsBetter) benchmark
 result for 87571b8be8105738d6da87df053d5a32e7fa001e

---
 dev/bench/data.js | 33162 ++++++++++++++++++++++----------------------
 1 file changed, 16581 insertions(+), 16581 deletions(-)

diff --git a/dev/bench/data.js b/dev/bench/data.js
index bc64ed525e6d0..90374865a933a 100644
--- a/dev/bench/data.js
+++ b/dev/bench/data.js
@@ -1,5 +1,5 @@
 window.BENCHMARK_DATA = {
-  "lastUpdate": 1717746519706,
+  "lastUpdate": 1717746526986,
   "repoUrl": "https://github.com/neuralmagic/nm-vllm",
   "entries": {
     "bigger_is_better": [
@@ -68310,668 +68310,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-04-23T14:46:41Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/df1f1a00d1fb111ef035ac385fafa38b5ed34488"
         },
-        "date": 1714210771537,
+        "date": 1714297078376,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 14671.440407500086,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 66931.1053829997,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:35:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 06:02:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 208.6643606973424,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 23342.27604626999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:35:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 06:02:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 161.46753849898232,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 17800.132582000515,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:35:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 06:02:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 117.89186615758388,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 261.657417060096,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:35:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 06:02:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 124.39165958450867,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 278.97989142772093,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:35:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 06:02:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 6387.269492000087,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 6064.163899500272,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:00:11 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:45:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 111.88932599993375,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 121.84322424331792,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:00:11 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:45:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 72.5244825007394,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 81.91542849999678,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:00:11 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:45:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 36.289724544148655,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 39.81237977005924,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:00:11 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:45:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 37.01978473213134,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 39.49480608319446,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:00:11 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:45:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 1923.126683501323,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 3666.834318499241,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:16:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:12:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 79.84467063327126,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 143.11174139328313,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:16:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:12:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 40.41554399918823,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 91.59698149960604,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:16:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:12:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 11.005281552743925,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 24.36734761260045,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:16:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:12:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 11.09402611059665,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 23.649514210886167,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:16:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:12:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 2136.2335784997413,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 6355.360378500336,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:48:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:03:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 136.25811479666177,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 111.07957002678936,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:48:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:03:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 104.31731850076176,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 69.13049949980632,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:48:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:03:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 15.10960127592157,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 36.273124083950954,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:48:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:03:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 13.650743723656545,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 36.95699933018845,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:48:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:03:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 18574.00114200027,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 6063.323772000331,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:57:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:10:01 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 1102.113974073327,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 127.74053738003205,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:57:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:10:01 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 230.7276385017758,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 92.22327899988159,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:57:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:10:01 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 162.38298922781706,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 40.47740103380456,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:57:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:10:01 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 148.92584730583258,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 40.41175559645828,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:57:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:10:01 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 62434.5530745004,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:24:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 40341.59530111335,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:24:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 40089.88794800007,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:24:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 99.23333490961022,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:24:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 101.38731865845863,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:24:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 6134.139074499217,
+            "value": 6138.0041939992225,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:36:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:35:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 106.36771252658946,
+            "value": 105.25370153986538,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:36:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:35:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 68.1048379992717,
+            "value": 69.34573000035016,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:36:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:35:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 34.66033560242653,
+            "value": 34.66916330861514,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:36:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:35:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 35.17876881436496,
+            "value": 35.20249453031695,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:36:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:35:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 65613.32472899994,
+            "value": 67696.56554000176,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:33:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 28628.61416851538,
+            "value": 30013.547229894015,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:33:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 27332.809500499934,
+            "value": 29051.331959499294,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:33:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 188.24308156207644,
+            "value": 189.95778387274478,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:33:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 199.30614640370118,
+            "value": 200.71795366500223,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:33:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 6062.0934375001525,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 257689.3706384999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:50:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:08:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 122.8672970640667,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 243934.78863560208,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:50:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:08:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 94.20262949970493,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 244595.2843140003,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:50:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:08:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 55.13576839101936,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 69.76695993422113,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:50:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:08:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 48.08502645026652,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 67.07681470882945,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:50:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:08:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 77644.25305650092,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 6068.786497999099,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:54:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:49:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 58056.55871913598,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 121.80484077736523,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:54:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:49:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 71003.32071900084,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 89.96812399891496,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:54:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:49:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 68.41126599025908,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 55.036998853002125,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:54:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:49:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 65.81709570907654,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 48.161595403597964,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:54:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:49:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 66410.8578134992,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 2142.5964224999916,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 06:03:40 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:47:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 23003.429712153353,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 136.95423646659037,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 06:03:40 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:47:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 17349.811404999855,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 105.06599900054425,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 06:03:40 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:47:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 261.2103598160443,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 15.088356492075098,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 06:03:40 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:47:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 278.55816558679993,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 13.655097184058752,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 06:03:40 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:47:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 5222.535258501011,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 1923.5307070002818,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:42:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:15:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 89.90427876022295,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 79.52589756001544,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:42:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:15:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 61.31367800117005,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 37.84988749976037,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:42:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:15:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 37.250890948022324,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 10.990196552600976,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:42:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:15:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 31.679306717872336,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 11.101862315424263,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:42:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:15:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 7859.990413498963,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 14727.956755001287,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:17:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:34:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 156.30002720930983,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 207.43928320336272,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:17:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:34:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 127.74674149932252,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 161.8715450003947,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:17:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:34:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 58.084073885518116,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 118.73434108304961,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:17:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:34:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 57.587868500857205,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 125.11435313409407,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:17:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:34:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 5386.467451000499,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 5239.008678499886,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:23:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:41:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 182.94118360667682,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 90.91267326681798,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:23:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:41:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 172.65122749995498,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 61.70038650088827,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:23:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:41:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 40.44215306390788,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 37.28207667565892,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:23:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:41:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 37.65400143078365,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 31.687763794898334,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:23:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:41:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 12297.73155500061,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 1815.862186999766,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:55:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:21:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 198.2033804333405,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 92.85708308672838,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:55:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:21:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 171.63947199969698,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 59.3996994994086,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:55:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:21:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 96.09748940261052,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 12.503594600157154,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:55:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:21:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 101.07422222877577,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 11.726245442219327,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:55:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:21:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 6068.778347000261,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 2432.085828500931,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:46:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:27:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 121.58755756997077,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 115.97731625733529,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:46:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:27:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 81.96738949936844,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 83.34300799924677,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:46:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:27:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 39.834716322933176,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 18.038354873429853,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:46:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:27:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 39.42931719131392,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 16.147303457152088,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:46:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:27:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 6054.284574499434,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 2071.159581500069,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:11:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:41:31 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 127.06042101664936,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 117.38934138680634,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:11:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:41:31 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 92.53861100114591,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 86.24154350036406,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:11:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:41:31 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 40.60141152783043,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 11.614447459022692,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:11:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:41:31 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 40.35647327143947,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 11.609165830648937,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:11:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:41:31 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 1814.8084434997145,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 18468.7140394999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:22:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:57:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 93.60767734337666,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 1110.133476052722,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:22:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:57:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 57.111170000098355,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 236.71987249872473,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:22:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:57:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 12.478057171616896,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 162.36971628466682,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:22:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:57:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 11.78352964388927,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 149.287468383281,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:22:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:57:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 62489.75667349987,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 7876.454225500311,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:25:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:16:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 40302.325046974685,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 156.95443960801884,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:25:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:16:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 39308.59702949965,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 128.45143799859216,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:25:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:16:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 99.83318207948984,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 57.94276547439565,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:25:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:16:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 102.22480738465657,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 57.508369824266325,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 04:25:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:16:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 2070.6088160004583,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 78565.39773550048,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:42:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:53:24 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 116.26469750008255,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 58711.33112383332,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:42:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:53:24 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 86.99318649996712,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 72157.52103149952,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:42:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:53:24 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 11.632297370645555,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 68.35338342569004,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:42:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:53:24 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 11.61245543034712,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 65.91079877096972,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:42:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:53:24 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 2426.99449849988,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 5395.843203001277,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:28:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:22:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 116.47309425734177,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 183.65219166668734,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:28:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:22:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 80.91874300043855,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 178.78151750119287,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:28:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:22:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 18.093533888949484,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 40.48714043924613,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:28:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:22:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 16.203817817727327,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 37.688588111216575,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:28:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:22:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 258293.34289049986,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:09:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 243556.67614118863,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:09:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 244923.2257779995,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:09:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 69.73269317950292,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:09:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 66.83747040314358,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-27 05:09:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 3653.525684001579,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 12238.293557499674,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:13:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:54:19 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 143.22962458001712,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 199.42883660927086,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:13:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:54:19 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 92.1650234995468,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 168.41479350023292,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:13:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:54:19 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 24.40717868159896,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 94.97464477132647,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:13:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:54:19 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 23.733869918782684,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 99.14474897894794,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-27 06:13:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:54:19 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -68992,668 +68992,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-04-23T14:46:41Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/df1f1a00d1fb111ef035ac385fafa38b5ed34488"
         },
-        "date": 1714297078376,
+        "date": 1714383658525,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 66931.1053829997,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 5403.3334414980345,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 06:02:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:25:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 23342.27604626999,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 182.6672527786577,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 06:02:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:25:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 17800.132582000515,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 173.26183499972103,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 06:02:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:25:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 261.657417060096,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 40.570686681999014,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 06:02:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:25:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 278.97989142772093,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 37.71272982877713,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 06:02:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:25:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 6064.163899500272,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 14712.270560000434,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:45:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:36:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 121.84322424331792,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 211.4012420333347,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:45:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:36:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 81.91542849999678,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 165.1276724996933,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:45:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:36:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 39.81237977005924,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 118.61316128523288,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:45:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:36:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 39.49480608319446,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 125.23053596180378,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:45:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:36:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 3666.834318499241,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 12388.316153499545,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:12:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:56:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 143.11174139328313,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 202.90839387460193,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:12:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:56:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 91.59698149960604,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 177.19953449977766,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:12:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:56:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 24.36734761260045,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 95.76608171097477,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:12:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:56:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 23.649514210886167,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 100.6277065136866,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:12:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:56:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 6355.360378500336,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 7894.943926500673,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:03:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:19:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 111.07957002678936,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 157.38730911329912,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:03:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:19:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 69.13049949980632,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 131.29129100070713,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:03:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:19:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 36.273124083950954,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 58.145339403127906,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:03:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:19:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 36.95699933018845,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 57.77174892640247,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:03:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:19:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 6063.323772000331,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 6143.9237764998325,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:10:01 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:37:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 127.74053738003205,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 105.57952202662516,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:10:01 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:37:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 92.22327899988159,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 68.90616300006513,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:10:01 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:37:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 40.47740103380456,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 34.740065797105615,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:10:01 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:37:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 40.41175559645828,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 35.27362721178171,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:10:01 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:37:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 62434.5530745004,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 2431.906657499894,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:24:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:30:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 40341.59530111335,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 116.1085367613535,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:24:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:30:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 40089.88794800007,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 80.23967450026248,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:24:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:30:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 99.23333490961022,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 18.105119612849144,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:24:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:30:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 101.38731865845863,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 16.194528842399823,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:24:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:30:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 6138.0041939992225,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 78082.65650449993,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:35:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:56:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 105.25370153986538,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 58163.472767429324,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:35:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:56:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 69.34573000035016,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 71294.46411749996,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:35:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:56:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 34.66916330861514,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 67.35790102715542,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:35:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:56:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 35.20249453031695,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 65.52902275067585,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:35:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:56:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 67696.56554000176,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 5234.401947000151,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:33:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:43:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 30013.547229894015,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 90.68538420006614,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:33:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:43:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 29051.331959499294,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 60.826272501799394,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:33:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:43:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 189.95778387274478,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 37.25110775777427,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:33:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:43:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 200.71795366500223,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 31.682919228131336,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:33:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:43:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 257689.3706384999,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 2065.334467500179,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:08:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:44:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 243934.78863560208,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 117.37230603336987,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:08:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:44:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 244595.2843140003,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 86.2206050005625,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:08:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:44:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 69.76695993422113,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 11.59097868551506,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:08:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:44:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 67.07681470882945,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 11.618711720487603,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:08:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:44:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 6068.786497999099,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 67333.70489449863,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:49:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 06:05:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 121.80484077736523,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 23449.42444652001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:49:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 06:05:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 89.96812399891496,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 17718.90502399947,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:49:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 06:05:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 55.036998853002125,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 262.3360423793547,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:49:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 06:05:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 48.161595403597964,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 279.50231642284837,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:49:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 06:05:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 2142.5964224999916,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 18693.89975500053,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:47:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:59:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 136.95423646659037,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 1106.037561508017,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:47:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:59:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 105.06599900054425,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 234.87662999832537,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:47:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:59:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 15.088356492075098,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 162.23267729135304,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:47:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:59:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 13.655097184058752,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 149.22055482149233,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:47:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:59:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 1923.5307070002818,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 61705.24036599909,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:15:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:27:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 79.52589756001544,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 39382.883654935344,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:15:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:27:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 37.84988749976037,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 38853.11500399985,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:15:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:27:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 10.990196552600976,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 98.6048637101342,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:15:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:27:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 11.101862315424263,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 100.93879064096944,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:15:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:27:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 14727.956755001287,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 6106.1727380001685,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:34:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:47:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 207.43928320336272,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 122.2248624533313,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:34:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:47:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 161.8715450003947,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 82.13618149875401,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:34:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:47:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 118.73434108304961,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 39.866295219973715,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:34:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:47:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 125.11435313409407,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 39.483253903939236,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:34:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:47:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 5239.008678499886,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 1841.4298955003687,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:41:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:24:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 90.91267326681798,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 92.76224013333679,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:41:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:24:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 61.70038650088827,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 59.39676350044465,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:41:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:24:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 37.28207667565892,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 12.53072772650871,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:41:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:24:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 31.687763794898334,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 11.7504803795625,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:41:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:24:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 1815.862186999766,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 1928.7328335003622,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:21:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:18:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 92.85708308672838,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 79.24429737999162,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:21:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:18:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 59.3996994994086,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 38.83016149939067,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:21:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:18:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 12.503594600157154,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 11.01682881146633,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:21:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:18:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 11.726245442219327,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 11.104867589473509,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:21:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:18:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 2432.085828500931,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 6375.923833499655,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:27:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:02:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 115.97731625733529,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 109.70417828670179,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:27:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:02:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 83.34300799924677,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 70.59495649991732,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:27:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:02:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 18.038354873429853,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 36.29284477344846,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:27:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:02:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 16.147303457152088,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 36.98958335851306,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:27:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:02:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 2071.159581500069,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 260014.52012900062,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:41:31 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:10:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 117.38934138680634,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 244655.706315512,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:41:31 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:10:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 86.24154350036406,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 246336.86162399955,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:41:31 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:10:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 11.614447459022692,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 70.77495113818273,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:41:31 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:10:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 11.609165830648937,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 66.8261656403469,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:41:31 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:10:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 18468.7140394999,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 3645.2240139988135,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:57:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:14:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 1110.133476052722,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 143.44058583311076,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:57:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:14:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 236.71987249872473,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 94.05171199978213,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:57:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:14:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 162.36971628466682,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 24.43651480224753,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:57:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:14:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 149.287468383281,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 23.768530231824148,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:57:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:14:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 7876.454225500311,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 6056.600670500302,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:16:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:12:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 156.95443960801884,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 128.3224846033469,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:16:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:12:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 128.45143799859216,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 87.35010950022115,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:16:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:12:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 57.94276547439565,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 40.54763684967656,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:16:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:12:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 57.508369824266325,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 40.42104585396235,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:16:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:12:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 78565.39773550048,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 6055.8608940009435,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:53:24 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:51:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 58711.33112383332,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 122.8573354986826,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:53:24 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:51:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 72157.52103149952,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 92.14041750055912,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:53:24 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:51:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 68.35338342569004,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 55.07886411780935,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:53:24 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:51:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 65.91079877096972,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 48.0826652607532,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 04:53:24 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:51:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 5395.843203001277,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 66309.31746399801,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:22:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:35:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 183.65219166668734,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 29019.913823834024,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:22:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:35:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 178.78151750119287,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 27652.122268498715,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:22:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:35:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 40.48714043924613,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 188.22594207093397,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:22:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:35:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 37.688588111216575,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 199.4164384190758,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-28 06:22:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:35:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 12238.293557499674,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 2122.4544235001304,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:54:19 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:50:07 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 199.42883660927086,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 136.2453950600563,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:54:19 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:50:07 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 168.41479350023292,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 104.65170849965943,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:54:19 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:50:07 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 94.97464477132647,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 15.058391072166453,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:54:19 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:50:07 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 99.14474897894794,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 13.626770453407055,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-28 05:54:19 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:50:07 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -69674,4078 +69674,4078 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-04-23T14:46:41Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/df1f1a00d1fb111ef035ac385fafa38b5ed34488"
         },
-        "date": 1714383658525,
+        "date": 1714470075644,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
             "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 5403.3334414980345,
+            "value": 5376.062333001755,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:25:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:24:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 182.6672527786577,
+            "value": 184.5827273187315,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:25:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:24:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 173.26183499972103,
+            "value": 173.52721449969977,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:25:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:24:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 40.570686681999014,
+            "value": 40.43196687663791,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:25:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:24:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 37.71272982877713,
+            "value": 37.58095798642143,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:25:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:24:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 14712.270560000434,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 12320.661744000063,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:36:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:55:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 211.4012420333347,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 200.01820418526768,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:36:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:55:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 165.1276724996933,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 169.44416600108525,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:36:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:55:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 118.61316128523288,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 96.04773220335846,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:36:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:55:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 125.23053596180378,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 101.17233322299779,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:36:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:55:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 12388.316153499545,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 3652.386217499952,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:56:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:14:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 202.90839387460193,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 142.93804023662233,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:56:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:14:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 177.19953449977766,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 95.80902800007607,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:56:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:14:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 95.76608171097477,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 24.498444220468098,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:56:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:14:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 100.6277065136866,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 23.784562093061254,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:56:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:14:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 7894.943926500673,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 1823.0353210010435,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:19:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:22:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 157.38730911329912,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 93.06960186000651,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:19:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:22:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 131.29129100070713,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 59.665249999852676,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:19:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:22:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 58.145339403127906,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 12.502380206719828,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:19:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:22:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 57.77174892640247,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 11.783106806446016,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:19:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:22:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 6143.9237764998325,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 5238.431358498929,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:37:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:42:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 105.57952202662516,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 91.44519897992723,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:37:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:42:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 68.90616300006513,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 60.88910249854962,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:37:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:42:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 34.740065797105615,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 37.252881617494005,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:37:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:42:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 35.27362721178171,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 31.70269005355473,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:37:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:42:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 2431.906657499894,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 1929.8616509995554,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:30:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:16:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 116.1085367613535,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 79.59311796000596,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:30:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:16:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 80.23967450026248,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 39.459064500078966,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:30:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:16:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 18.105119612849144,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 10.979780574097273,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:30:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:16:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 16.194528842399823,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 11.09183504894853,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:30:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:16:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 78082.65650449993,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 65573.8574650004,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:56:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:34:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 58163.472767429324,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 28950.78831299604,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:56:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:34:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 71294.46411749996,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 28142.69032300217,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:56:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:34:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 67.35790102715542,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 188.86090202522692,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:56:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:34:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 65.52902275067585,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 199.33138919321786,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:56:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:34:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 5234.401947000151,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 6378.783236000345,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:43:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:00:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 90.68538420006614,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 108.38548640673253,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:43:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:00:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 60.826272501799394,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 71.50023299982422,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:43:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:00:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 37.25110775777427,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 36.32068798008349,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:43:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:00:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 31.682919228131336,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 36.99483870421225,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:43:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:00:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 2065.334467500179,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 6047.278685499805,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:44:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:11:09 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 117.37230603336987,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 125.4039507166696,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:44:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:11:09 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 86.2206050005625,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 87.60409249862278,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:44:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:11:09 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 11.59097868551506,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 40.662599829984984,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:44:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:11:09 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 11.618711720487603,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 40.43363331252289,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:44:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:11:09 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 67333.70489449863,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 6089.519108001696,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 06:05:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:51:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 23449.42444652001,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 120.69910238930122,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 06:05:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:51:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 17718.90502399947,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 89.65837299911072,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 06:05:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:51:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 262.3360423793547,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 55.06700430067017,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 06:05:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:51:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 279.50231642284837,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 48.04969955440588,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 06:05:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:51:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 18693.89975500053,
+            "value": 18383.634987500045,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:59:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:58:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 1106.037561508017,
+            "value": 977.7821168053584,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:59:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:58:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 234.87662999832537,
+            "value": 233.23141549917636,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:59:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:58:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 162.23267729135304,
+            "value": 161.4755453013489,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:59:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:58:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 149.22055482149233,
+            "value": 147.98108700900505,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:59:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:58:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 61705.24036599909,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 78107.6898225001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:27:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:54:38 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 39382.883654935344,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 58225.91374190133,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:27:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:54:38 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 38853.11500399985,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 71513.63180799945,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:27:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:54:38 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 98.6048637101342,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 67.80498533429089,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:27:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:54:38 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 100.93879064096944,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 65.44014757829031,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:27:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:54:38 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 6106.1727380001685,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 66338.3866639997,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:47:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 06:04:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 122.2248624533313,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 22944.870291643343,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:47:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 06:04:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 82.13618149875401,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 17367.390752000574,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:47:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 06:04:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 39.866295219973715,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 260.65817349493784,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:47:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 06:04:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 39.483253903939236,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 277.99167895448284,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:47:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 06:04:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 1841.4298955003687,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 2134.466259499277,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:24:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:48:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 92.76224013333679,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 136.62190305008457,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:24:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:48:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 59.39676350044465,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 102.67750049933966,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:24:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:48:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 12.53072772650871,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 15.099409057440004,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:24:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:48:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 11.7504803795625,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 13.654440891257662,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:24:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:48:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 1928.7328335003622,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 2408.684269999867,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:18:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:28:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 79.24429737999162,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 116.16479182931411,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:18:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:28:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 38.83016149939067,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 80.27312500053085,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:18:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:28:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 11.01682881146633,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 18.106101958196053,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:18:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:28:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 11.104867589473509,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 16.139953761015917,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:18:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:28:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 6375.923833499655,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 14822.945272499965,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:02:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:35:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 109.70417828670179,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 205.64598047202162,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:02:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:35:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 70.59495649991732,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 163.80199600098422,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:02:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:35:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 36.29284477344846,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 119.56480416917998,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:02:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:35:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 36.98958335851306,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 126.0499936903422,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:02:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:35:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 260014.52012900062,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 7886.070769500293,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:10:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:17:41 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 244655.706315512,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 157.57009318131654,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:10:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:17:41 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 246336.86162399955,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 132.7795765000701,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:10:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:17:41 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 70.77495113818273,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 58.182222314526086,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:10:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:17:41 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 66.8261656403469,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 57.73655366013833,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:10:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:17:41 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 3645.2240139988135,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 6141.3096454998595,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:14:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:36:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 143.44058583311076,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 104.48978885987647,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:14:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:36:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 94.05171199978213,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 67.92814349955734,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:14:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:36:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 24.43651480224753,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 34.73078370484817,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:14:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:36:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 23.768530231824148,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 35.22087641082219,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:14:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:36:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 6056.600670500302,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 259394.88207350043,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:12:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:09:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 128.3224846033469,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 244259.34381698666,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:12:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:09:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 87.35010950022115,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 245936.9741119999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:12:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:09:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 40.54763684967656,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 70.70314073132613,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:12:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:09:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 40.42104585396235,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 66.68409539982376,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 04:12:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:09:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 6055.8608940009435,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 6071.920400999261,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:51:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:46:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 122.8573354986826,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 121.85457794001802,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:51:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:46:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 92.14041750055912,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 78.61482199950842,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:51:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:46:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 55.07886411780935,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 39.801395855077615,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:51:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:46:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 48.0826652607532,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 39.502406777722285,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:51:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:46:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 66309.31746399801,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 63004.24120249954,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:35:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:25:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 29019.913823834024,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 40527.186443927385,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:35:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:25:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 27652.122268498715,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 39565.95233650023,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:35:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:25:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 188.22594207093397,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 99.86915523887204,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:35:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:25:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 199.4164384190758,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 102.12643246719269,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-29 06:35:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:25:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 2122.4544235001304,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 2066.5399399995295,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:50:07 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:42:30 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 136.2453950600563,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 116.18510289994447,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:50:07 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:42:30 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 104.65170849965943,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 86.63544850060134,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:50:07 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:42:30 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 15.058391072166453,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 11.6430866408856,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:50:07 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:42:30 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 13.626770453407055,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
+            "value": 11.652037467004137,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-29 05:50:07 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:42:30 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
       {
         "commit": {
           "author": {
-            "name": "Andy Linfoot",
-            "username": "andy-neuma",
-            "email": "78757007+andy-neuma@users.noreply.github.com"
+            "name": "Robert Shaw",
+            "username": "robertgshaw2-neuralmagic",
+            "email": "114415538+robertgshaw2-neuralmagic@users.noreply.github.com"
           },
           "committer": {
             "name": "GitHub",
             "username": "web-flow",
             "email": "noreply@github.com"
           },
-          "id": "df1f1a00d1fb111ef035ac385fafa38b5ed34488",
-          "message": "switch to GCP based build VM (#201)\n\nSUMMARY:\r\n* switch over to GCP VM's for building stage of \"remote push\"\r\n\r\nNOTE: this is just the start. i'll redo the benchmarking and nightly\r\nworkflows in an upcoming PR.\r\n\r\nTEST PLAN:\r\nruns on remote push\r\n\r\nCo-authored-by: andy-neuma <andy@neuralmagic.com>",
-          "timestamp": "2024-04-23T14:46:41Z",
-          "url": "https://github.com/neuralmagic/nm-vllm/commit/df1f1a00d1fb111ef035ac385fafa38b5ed34488"
+          "id": "d485d3e5c9721b27cd0fe345d062fabc038049a1",
+          "message": "Marlin 2:4 Downstream (for v0.3 release) (#239)\n\nSupport marlin 2:4 in downstream so we can have it in the release",
+          "timestamp": "2024-05-14T01:54:06Z",
+          "url": "https://github.com/neuralmagic/nm-vllm/commit/d485d3e5c9721b27cd0fe345d062fabc038049a1"
         },
-        "date": 1714470075644,
+        "date": 1715671992207,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 5376.062333001755,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7699.20140399995,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:24:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:31:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 184.5827273187315,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 155.60859181600426,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:24:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:31:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 173.52721449969977,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 126.44680549999521,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:24:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:31:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 40.43196687663791,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.35804097393765,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:24:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:31:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 37.58095798642143,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.06640501978451,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:24:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:31:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 12320.661744000063,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 14952.524137000182,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:55:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 04:10:54 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 200.01820418526768,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 213.97269461333295,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:55:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 04:10:54 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 169.44416600108525,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 170.41449899988947,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:55:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 04:10:54 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 96.04773220335846,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.74922727076351,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:55:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 04:10:54 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 101.17233322299779,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 127.51590181598549,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:55:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 04:10:54 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 3652.386217499952,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3631.2452459997075,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:14:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:20:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 142.93804023662233,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143.76172412334503,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:14:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:20:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 95.80902800007607,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.35134399993694,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:14:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:20:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 24.498444220468098,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.310515078398957,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:14:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:20:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 23.784562093061254,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.651293949528196,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:14:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:20:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 1823.0353210010435,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5856.753418500375,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:22:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:57:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 93.06960186000651,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 118.7610248973045,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:22:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:57:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 59.665249999852676,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.27816299910774,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:22:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:57:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 12.502380206719828,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 53.309391464699814,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:22:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:57:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 11.783106806446016,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 46.510393959855044,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:22:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:57:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 5238.431358498929,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16020.202677499583,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:42:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 05:04:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 91.44519897992723,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 253.06591161863494,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:42:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 05:04:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 60.88910249854962,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 155.56535249925219,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:42:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 05:04:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 37.252881617494005,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 146.14549104973028,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:42:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 05:04:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 31.70269005355473,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 134.71175487648534,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:42:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 05:04:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 1929.8616509995554,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2412.483122499907,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:16:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:37:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 79.59311796000596,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 116.41906162533633,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:16:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:37:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 39.459064500078966,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.04159249973964,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:16:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:37:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 10.979780574097273,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.954717798726882,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:16:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:37:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 11.09183504894853,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.191531535676557,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:16:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:37:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 65573.8574650004,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1928.175570500116,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:34:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:58:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 28950.78831299604,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.48966796332327,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:34:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:58:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 28142.69032300217,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.23218350042225,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:34:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:58:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 188.86090202522692,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.258276658293926,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:34:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:58:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 199.33138919321786,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.51521060171141,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:34:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:58:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 6378.783236000345,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6352.16404099998,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:00:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:18:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 108.38548640673253,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 110.16861786666747,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:00:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:18:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 71.50023299982422,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.68160549998947,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:00:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:18:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 36.32068798008349,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.22538058499631,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:00:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:18:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 36.99483870421225,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.91675834190811,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:00:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:18:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 6047.278685499805,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1940.8526524998706,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:11:09 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:25:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 125.4039507166696,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.21267474664396,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:11:09 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:25:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 87.60409249862278,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.89376600020478,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:11:09 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:25:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 40.662599829984984,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.089562012528129,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:11:09 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:25:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 40.43363331252289,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.173363486138213,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:11:09 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:25:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 6089.519108001696,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 63194.19884299987,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:51:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:40:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 120.69910238930122,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 27375.426378377993,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:51:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:40:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 89.65837299911072,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 25736.021327001254,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:51:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:40:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 55.06700430067017,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 185.86486330113644,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:51:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:40:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 48.04969955440588,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 197.56442768654986,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:51:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:40:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 18383.634987500045,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 75939.70493999995,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:58:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:04:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 977.7821168053584,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56341.681981972004,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:58:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:04:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 233.23141549917636,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69316.22169849992,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:58:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:04:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 161.4755453013489,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.55430329170304,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:58:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:04:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 147.98108700900505,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.33734177531761,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-04-30 06:58:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:04:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 78107.6898225001,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2033.819562499957,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:54:38 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:51:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 58225.91374190133,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.19037280665968,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:54:38 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:51:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 71513.63180799945,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.89878250038237,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:54:38 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:51:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 67.80498533429089,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.560121123356186,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:54:38 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:51:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 65.44014757829031,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.691221148500436,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:54:38 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:51:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 66338.3866639997,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2571.6090910000275,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 06:04:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 04:04:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 22944.870291643343,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.15518762534703,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 06:04:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 04:04:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 17367.390752000574,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.6943180002454,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 06:04:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 04:04:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 260.65817349493784,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.216760619551575,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 06:04:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 04:04:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 277.99167895448284,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.232633518539803,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 06:04:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 04:04:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 2134.466259499277,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5996.506612500014,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:48:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:24:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 136.62190305008457,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 127.50568606667495,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:48:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:24:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 102.67750049933966,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 88.20934350001153,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:48:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:24:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 15.099409057440004,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.21830872028567,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:48:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:24:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 13.654440891257662,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.20192526693389,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:48:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:24:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 2408.684269999867,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5082.261738499255,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:28:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:49:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 116.16479182931411,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 88.498999799934,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:28:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:49:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 80.27312500053085,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60.69171000035567,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:28:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:49:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 18.106101958196053,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.21554315710666,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:28:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:49:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 16.139953761015917,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30.829622089502823,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:28:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:49:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 14822.945272499965,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13589.161211999908,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:35:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:44:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 205.64598047202162,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 203.4137675746612,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:35:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:44:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 163.80199600098422,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 159.45165350012758,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:35:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:44:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 119.56480416917998,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 106.80662248274147,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:35:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:44:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 126.0499936903422,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 111.48762767348046,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:35:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:44:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 7886.070769500293,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 254818.33761899997,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:17:41 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:18:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 157.57009318131654,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 241011.8444389933,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:17:41 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:18:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 132.7795765000701,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 242041.27989450013,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:17:41 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:18:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 58.182222314526086,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.88961185379736,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:17:41 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:18:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 57.73655366013833,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.35204241527413,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:17:41 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:18:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 6141.3096454998595,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60430.702900000026,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:36:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:39:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 104.48978885987647,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38304.701958351994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:36:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:39:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 67.92814349955734,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37548.55232750003,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:36:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:39:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 34.73078370484817,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 97.69063523534015,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:36:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:39:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 35.22087641082219,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 100.23740548629765,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:36:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:39:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 259394.88207350043,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6053.155197000024,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:09:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:55:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 244259.34381698666,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.90334264000285,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:09:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:55:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 245936.9741119999,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.16031500003373,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:09:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:55:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 70.70314073132613,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.63900264773282,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:09:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:55:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 66.68409539982376,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.268328228789144,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:09:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:55:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 6071.920400999261,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5320.154192500013,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:46:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:30:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 121.85457794001802,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 181.2350845293695,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:46:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:30:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 78.61482199950842,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 171.38535149933887,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:46:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:30:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 39.801395855077615,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.67783502514967,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:46:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:30:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 39.502406777722285,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.79312972752162,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:46:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:30:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 63004.24120249954,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1828.8207245000194,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:25:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:31:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 40527.186443927385,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.21822712335475,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:25:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:31:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 39565.95233650023,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56.458014999861916,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:25:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:31:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 99.86915523887204,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.639844913203472,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:25:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:31:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 102.12643246719269,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.911983000670325,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 04:25:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:31:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 2066.5399399995295,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6098.667970999941,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:42:30 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:48:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 116.18510289994447,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 104.38908700000866,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:42:30 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:48:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 86.63544850060134,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.73955900007422,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:42:30 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:48:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 11.6430866408856,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.51027741501625,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:42:30 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:48:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}",
-            "value": 11.652037467004137,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.03588377959485,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, Mar  7 2024, 18:39:53) [GCC 9.4.0]\",\n    \"torch_version\": \"2.2.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-04-30 05:42:30 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:48:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
       {
         "commit": {
           "author": {
-            "name": "Robert Shaw",
-            "username": "robertgshaw2-neuralmagic",
-            "email": "114415538+robertgshaw2-neuralmagic@users.noreply.github.com"
+            "name": "Domenic Barbuzzi",
+            "username": "dbarbuzzi",
+            "email": "dbarbuzzi@gmail.com"
           },
           "committer": {
             "name": "GitHub",
             "username": "web-flow",
             "email": "noreply@github.com"
           },
-          "id": "d485d3e5c9721b27cd0fe345d062fabc038049a1",
-          "message": "Marlin 2:4 Downstream (for v0.3 release) (#239)\n\nSupport marlin 2:4 in downstream so we can have it in the release",
-          "timestamp": "2024-05-14T01:54:06Z",
-          "url": "https://github.com/neuralmagic/nm-vllm/commit/d485d3e5c9721b27cd0fe345d062fabc038049a1"
+          "id": "3a2545670126854a4a685edd889fe68f2fe250c3",
+          "message": "Misc CI/CD updates (#240)",
+          "timestamp": "2024-05-14T14:41:26Z",
+          "url": "https://github.com/neuralmagic/nm-vllm/commit/3a2545670126854a4a685edd889fe68f2fe250c3"
         },
-        "date": 1715671992207,
+        "date": 1715759306117,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7699.20140399995,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:31:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6101.23209849985,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:02:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 155.60859181600426,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 103.81138589331081,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:31:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:02:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 126.44680549999521,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.00526649986205,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:31:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:02:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.35804097393765,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.55243954826612,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:31:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:02:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.06640501978451,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.05923950958349,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:31:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:02:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 14952.524137000182,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7680.819542500103,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 04:10:54 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:45:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 213.97269461333295,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 155.6045172239992,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 04:10:54 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:45:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 170.41449899988947,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 130.96245799999906,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 04:10:54 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:45:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.74922727076351,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.10173666541715,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 04:10:54 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:45:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 127.51590181598549,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56.79671118557445,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 04:10:54 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:45:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3631.2452459997075,
+            "value": 3649.93440049966,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:20:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:35:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.76172412334503,
+            "value": 142.4122476466558,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:20:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:35:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.35134399993694,
+            "value": 92.2783384999093,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:20:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:35:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.310515078398957,
+            "value": 24.36573514002923,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:20:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:35:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.651293949528196,
+            "value": 23.69197726093354,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:20:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:35:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5856.753418500375,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 254531.2522874997,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:57:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:33:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 118.7610248973045,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 240153.88880120867,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:57:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:33:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.27816299910774,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 241515.4948259999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:57:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:33:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 53.309391464699814,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.14310567244951,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:57:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:33:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 46.510393959855044,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.18372669169628,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:57:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:33:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16020.202677499583,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6043.307231999961,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 05:04:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:39:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 253.06591161863494,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 126.88578150666443,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 05:04:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:39:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 155.56535249925219,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.40760800003955,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 05:04:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:39:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 146.14549104973028,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.35962738190891,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 05:04:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:39:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 134.71175487648534,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.23074240547815,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 05:04:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:39:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2412.483122499907,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6041.992879999725,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:37:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:10:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 116.41906162533633,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.46525663999304,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:37:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:10:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.04159249973964,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.24388900003032,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:37:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:10:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.954717798726882,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.55438731048751,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:37:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:10:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.191531535676557,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.16477230530249,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:37:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:10:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1928.175570500116,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1942.0991919996595,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:58:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:40:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.48966796332327,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78.98660993330243,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:58:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:40:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.23218350042225,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.880631999996695,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:58:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:40:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.258276658293926,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.054870860215237,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:58:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:40:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.51521060171141,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.121548043603779,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:58:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:40:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6352.16404099998,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5049.914612500288,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:18:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:03:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 110.16861786666747,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 90.22882873998849,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:18:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:03:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.68160549998947,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 62.91983600021922,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:18:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:03:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.22538058499631,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.16473733373438,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:18:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:03:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.91675834190811,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30.799053717961876,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:18:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:03:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1940.8526524998706,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59942.670705500066,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:25:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:53:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.21267474664396,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38040.75315246733,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:25:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:53:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.89376600020478,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37545.920685000056,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:25:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:53:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.089562012528129,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 97.56689999797372,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:25:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:53:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.173363486138213,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 100.25388810633912,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:25:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:53:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 63194.19884299987,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2411.809580999943,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:40:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:52:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 27375.426378377993,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 115.5320580693345,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:40:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:52:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 25736.021327001254,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.90481249953518,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:40:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:52:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 185.86486330113644,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.94385816744697,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:40:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:52:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 197.56442768654986,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.169444361253195,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:40:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:52:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 75939.70493999995,
+            "value": 76860.2623894999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:04:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:19:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56341.681981972004,
+            "value": 57203.97092948268,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:04:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:19:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69316.22169849992,
+            "value": 70173.28971250003,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:04:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:19:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.55430329170304,
+            "value": 67.93434489798318,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:04:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:19:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.33734177531761,
+            "value": 64.58440380578197,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:04:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:19:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2033.819562499957,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5330.106188499485,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:51:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:45:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.19037280665968,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 182.17156377735228,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:51:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:45:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.89878250038237,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 172.8889075002371,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:51:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:45:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.560121123356186,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.881822445030636,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:51:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:45:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.691221148500436,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.93948749345439,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:51:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:45:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2571.6090910000275,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1920.0154015002227,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 04:04:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:12:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.15518762534703,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.74568158999682,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 04:04:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:12:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.6943180002454,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60.970186999838916,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 04:04:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:12:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.216760619551575,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.202795011221378,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 04:04:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:12:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.232633518539803,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.448855716742237,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 04:04:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:12:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5996.506612500014,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15703.07831700029,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:24:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:19:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 127.50568606667495,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 224.3258602553542,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:24:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:19:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 88.20934350001153,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 150.11075299935328,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:24:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:19:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.21830872028567,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 144.76176657570113,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:24:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:19:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.20192526693389,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 132.821358348712,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:24:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:19:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5082.261738499255,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 14704.967883500103,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:49:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:25:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 88.498999799934,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 215.22999095600426,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:49:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:25:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60.69171000035567,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 167.70195599974613,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:49:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:25:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.21554315710666,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 117.65424397527357,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:49:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:25:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30.829622089502823,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 122.98717967737052,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:49:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:25:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13589.161211999908,
+            "value": 13476.068847999613,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:44:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:59:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 203.4137675746612,
+            "value": 199.25043000799937,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:44:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:59:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 159.45165350012758,
+            "value": 154.84395100020265,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:44:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:59:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 106.80662248274147,
+            "value": 105.13777527980294,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:44:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:59:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 111.48762767348046,
+            "value": 109.68111962814491,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:44:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:59:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 254818.33761899997,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61124.359150999226,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:18:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:55:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 241011.8444389933,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 26456.67674058268,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:18:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:55:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 242041.27989450013,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24940.77680600003,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:18:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:55:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.88961185379736,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 185.534421332363,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:18:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:55:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.35204241527413,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 197.434688678055,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:18:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:55:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60430.702900000026,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5879.7700890008855,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:39:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:12:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38304.701958351994,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 118.73317806397729,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:39:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:12:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37548.55232750003,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 87.43678700011515,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:39:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:12:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 97.69063523534015,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 53.14737516562541,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:39:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:12:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 100.23740548629765,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 46.20036354770608,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:39:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:12:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6053.155197000024,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:55:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1818.5470739995253,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:46:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.90334264000285,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.5462979966657,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:55:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:46:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.16031500003373,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56.357280499923945,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:55:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:46:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.63900264773282,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.59746519769587,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:55:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:46:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.268328228789144,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.822087371719496,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:55:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:46:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5320.154192500013,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6362.97378599994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:30:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:33:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 181.2350845293695,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 109.44010315333193,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:30:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:33:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 171.38535149933887,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.66087950004385,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:30:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:33:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.67783502514967,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.251718120805435,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:30:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:33:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.79312972752162,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.98496586314468,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-14 04:30:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:33:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1828.8207245000194,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2032.796726000015,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:31:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:06:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.21822712335475,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.79107167337497,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:31:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:06:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56.458014999861916,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.870010999948136,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:31:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:06:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.639844913203472,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.590794727441683,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:31:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:06:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.911983000670325,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.656013017847151,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 03:31:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:06:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6098.667970999941,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2563.39438949999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:48:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:19:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 104.38908700000866,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.95549527066639,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:48:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:19:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.73955900007422,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.83280099997137,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:48:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:19:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.51027741501625,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.130742047195096,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:48:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:19:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.03588377959485,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.251971973906564,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-14 02:48:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:19:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
       {
         "commit": {
           "author": {
-            "name": "Domenic Barbuzzi",
-            "username": "dbarbuzzi",
-            "email": "dbarbuzzi@gmail.com"
+            "name": "dhuangnm",
+            "username": "dhuangnm",
+            "email": "74931910+dhuangnm@users.noreply.github.com"
           },
           "committer": {
             "name": "GitHub",
             "username": "web-flow",
             "email": "noreply@github.com"
           },
-          "id": "3a2545670126854a4a685edd889fe68f2fe250c3",
-          "message": "Misc CI/CD updates (#240)",
-          "timestamp": "2024-05-14T14:41:26Z",
-          "url": "https://github.com/neuralmagic/nm-vllm/commit/3a2545670126854a4a685edd889fe68f2fe250c3"
+          "id": "6334dd3bfbd72d30e969bd02edb1280cad5af5a2",
+          "message": "bump version to 0.3.0 (#241)\n\nbump version to 0.3.0",
+          "timestamp": "2024-05-15T13:23:12Z",
+          "url": "https://github.com/neuralmagic/nm-vllm/commit/6334dd3bfbd72d30e969bd02edb1280cad5af5a2"
         },
-        "date": 1715759306117,
+        "date": 1715849787199,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6101.23209849985,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 258118.46169200022,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:02:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:41:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 103.81138589331081,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 242367.391415986,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:02:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:41:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.00526649986205,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 244689.68778350018,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:02:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:41:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.55243954826612,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.69259278402636,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:02:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:41:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.05923950958349,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.36323963001757,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:02:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:41:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7680.819542500103,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3638.984193500164,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:45:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 05:42:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 155.6045172239992,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 141.95815483667502,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:45:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 05:42:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 130.96245799999906,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.62860549981633,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:45:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 05:42:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.10173666541715,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.342034556891115,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:45:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 05:42:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56.79671118557445,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.64824972562455,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:45:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 05:42:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3649.93440049966,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59371.91879149998,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:35:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:01:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 142.4122476466558,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37311.768499666,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:35:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:01:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.2783384999093,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36652.779958999985,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:35:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:01:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.36573514002923,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 97.33364648464519,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:35:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:01:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.69197726093354,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 99.55867090197547,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:35:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:01:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 254531.2522874997,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13685.08536049967,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:33:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:07:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 240153.88880120867,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 200.8336691906637,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:33:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:07:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 241515.4948259999,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 158.36061550044178,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:33:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:07:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.14310567244951,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 107.19594483218475,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:33:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:07:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.18372669169628,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 112.39465867675736,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:33:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:07:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6043.307231999961,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1819.5840399998815,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:39:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:54:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 126.88578150666443,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.42182694997305,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:39:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:54:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.40760800003955,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.673415999692224,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:39:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:54:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.35962738190891,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.590435891314716,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:39:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:54:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.23074240547815,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.866962514121967,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:39:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:54:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6041.992879999725,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6361.116683499972,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:10:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:40:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.46525663999304,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 110.36964258666406,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:10:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:40:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.24388900003032,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.05765199994585,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:10:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:40:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.55438731048751,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.20724675453855,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:10:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:40:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.16477230530249,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.91689382031487,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:10:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:40:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1942.0991919996595,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5338.608900499821,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:40:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 05:52:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78.98660993330243,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 183.17088898930766,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:40:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 05:52:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.880631999996695,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 175.92986100044072,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:40:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 05:52:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.054870860215237,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.62952981019738,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:40:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 05:52:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.121548043603779,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.958026242584666,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:40:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 05:52:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5049.914612500288,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6099.107161499887,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:03:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:10:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 90.22882873998849,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 105.43499850663466,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:03:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:10:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 62.91983600021922,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.7180684999239,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:03:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:10:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.16473733373438,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.54700459908418,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:03:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:10:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30.799053717961876,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.045846241846135,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:03:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:10:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59942.670705500066,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 63258.93250450008,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:53:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:03:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38040.75315246733,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 27231.074444095368,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:53:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:03:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37545.920685000056,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 25326.550558000235,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:53:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:03:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 97.56689999797372,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 186.14266411794483,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:53:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:03:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 100.25388810633912,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 197.1091215325504,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:53:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:03:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2411.809580999943,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2029.6977950006294,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:52:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:14:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 115.5320580693345,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.6466327933352,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:52:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:14:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.90481249953518,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42.70703700012746,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:52:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:14:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.94385816744697,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.599106473887705,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:52:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:14:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.169444361253195,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.677298393022797,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:52:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:14:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 76860.2623894999,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2394.628295000075,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:19:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:00:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57203.97092948268,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 116.39873904799848,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:19:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:00:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70173.28971250003,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.23161299999992,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:19:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:00:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.93434489798318,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.90703764958482,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:19:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:00:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.58440380578197,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.11724712321176,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:19:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:00:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5330.106188499485,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 14672.161679499823,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:45:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:33:23 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 182.17156377735228,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 213.41412069200305,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:45:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:33:23 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 172.8889075002371,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 174.149260500144,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:45:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:33:23 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.881822445030636,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 117.7828334274791,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:45:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:33:23 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.93948749345439,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 123.60739112711852,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:45:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:33:23 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1920.0154015002227,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5051.6441065001345,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:12:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:11:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.74568158999682,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.2713155733339,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:12:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:11:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60.970186999838916,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 63.221417499335075,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:12:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:11:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.202795011221378,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.14609481401333,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:12:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:11:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.448855716742237,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30.811671150266346,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:12:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:11:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15703.07831700029,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7640.926798500004,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:19:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:53:01 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 224.3258602553542,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 155.8186790013333,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:19:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:53:01 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 150.11075299935328,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 129.10266649998903,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:19:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:53:01 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 144.76176657570113,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56.98906212683416,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:19:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:53:01 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 132.821358348712,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56.57185544198197,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:19:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:53:01 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 14704.967883500103,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1921.5456125002675,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:25:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:20:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 215.22999095600426,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.37519164667174,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:25:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:20:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 167.70195599974613,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.093453000059526,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:25:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:20:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 117.65424397527357,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.16933657139751,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:25:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:20:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 122.98717967737052,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.475328412495637,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:25:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:20:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13476.068847999613,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15725.322180499461,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:59:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:27:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 199.25043000799937,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 228.28753929002778,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:59:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:27:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 154.84395100020265,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 147.56340050007566,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:59:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:27:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 105.13777527980294,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 145.2820009003947,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:59:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:27:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 109.68111962814491,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 133.30119400852809,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:59:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:27:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 61124.359150999226,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5888.8233325005785,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:55:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:20:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 26456.67674058268,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 118.89838669599703,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:55:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:20:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24940.77680600003,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 86.66479900057311,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:55:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:20:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 185.534421332363,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 53.12787558749585,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:55:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:20:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 197.434688678055,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 46.272510246963996,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 04:55:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:20:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5879.7700890008855,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2589.057005000086,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:12:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:26:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 118.73317806397729,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.58779895730913,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:12:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:26:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 87.43678700011515,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.76591749993167,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:12:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:26:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 53.14737516562541,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.13812961430733,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:12:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:26:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 46.20036354770608,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.174284445696333,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-15 05:12:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:26:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1818.5470739995253,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6056.139924999969,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:46:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:46:28 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.5462979966657,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 128.55583270999773,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:46:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:46:28 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56.357280499923945,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 90.00014349999219,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:46:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:46:28 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.59746519769587,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.448909689841884,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:46:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:46:28 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.822087371719496,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.259250252047636,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 03:46:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:46:28 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6362.97378599994,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6027.659153499826,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:33:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:17:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 109.44010315333193,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.16549665333878,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:33:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:17:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.66087950004385,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 77.90766700009044,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:33:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:17:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.251718120805435,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.63301758757746,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:33:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:17:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.98496586314468,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.29818135093642,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 02:33:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:17:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2032.796726000015,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 75949.34716449984,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:06:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:26:28 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.79107167337497,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56493.18321569867,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:06:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:26:28 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.870010999948136,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69423.33838600008,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:06:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:26:28 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.590794727441683,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.62640358472382,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:06:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:26:28 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.656013017847151,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.55388043976367,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:06:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:26:28 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2563.39438949999,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1937.9856095001742,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:19:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:48:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.95549527066639,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.23790262007363,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:19:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:48:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.83280099997137,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 43.146204000549915,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:19:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:48:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.130742047195096,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.083233474550786,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:19:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:48:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.251971973906564,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.149322394807717,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.2.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-15 04:19:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:48:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
       {
         "commit": {
           "author": {
-            "name": "dhuangnm",
-            "username": "dhuangnm",
-            "email": "74931910+dhuangnm@users.noreply.github.com"
+            "name": "Michael Goin",
+            "username": "mgoin",
+            "email": "michael@neuralmagic.com"
           },
           "committer": {
             "name": "GitHub",
             "username": "web-flow",
             "email": "noreply@github.com"
           },
-          "id": "6334dd3bfbd72d30e969bd02edb1280cad5af5a2",
-          "message": "bump version to 0.3.0 (#241)\n\nbump version to 0.3.0",
-          "timestamp": "2024-05-15T13:23:12Z",
-          "url": "https://github.com/neuralmagic/nm-vllm/commit/6334dd3bfbd72d30e969bd02edb1280cad5af5a2"
+          "id": "59cf939c70173f2419d143a738505180c37465a4",
+          "message": "Add FP8 and marlin 2:4 tests for lm-eval (#244)",
+          "timestamp": "2024-05-16T20:43:49Z",
+          "url": "https://github.com/neuralmagic/nm-vllm/commit/59cf939c70173f2419d143a738505180c37465a4"
         },
-        "date": 1715849787199,
+        "date": 1715931046658,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 258118.46169200022,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6168.269709499782,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:41:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:44:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 242367.391415986,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 105.98510685333773,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:41:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:44:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 244689.68778350018,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.63699749987973,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:41:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:44:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.69259278402636,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.674676269909696,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:41:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:44:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.36323963001757,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.19037892508264,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:41:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:44:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15854.127336500824,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 05:00:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 229.12089890001153,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 05:00:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 155.03108400025667,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 05:00:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 145.8418386060145,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 05:00:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 132.76484605535293,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 05:00:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3638.984193500164,
+            "value": 3638.4393895000358,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 05:42:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:16:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 141.95815483667502,
+            "value": 143.43216443332494,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 05:42:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:16:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.62860549981633,
+            "value": 91.1189015000673,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 05:42:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:16:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.342034556891115,
+            "value": 24.13681476256774,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 05:42:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:16:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.64824972562455,
+            "value": 23.569469998782356,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 05:42:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:16:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59371.91879149998,
+            "value": 59962.44221149993,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:01:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:35:32 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37311.768499666,
+            "value": 37740.73947849,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:01:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:35:32 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36652.779958999985,
+            "value": 37190.24467849999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:01:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:35:32 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 97.33364648464519,
+            "value": 97.62116391947876,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:01:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:35:32 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 99.55867090197547,
+            "value": 99.52574322834293,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:01:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:35:32 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13685.08536049967,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6363.175155999954,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:07:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:14:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 200.8336691906637,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 109.46530145999759,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:07:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:14:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 158.36061550044178,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.22012699998459,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:07:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:14:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 107.19594483218475,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.18218308004019,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:07:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:14:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 112.39465867675736,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.90844879031058,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:07:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:14:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1819.5840399998815,
+            "value": 1829.1798820000622,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:54:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:27:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.42182694997305,
+            "value": 92.8089847133136,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:54:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:27:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.673415999692224,
+            "value": 55.889288499656686,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:54:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:27:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.590435891314716,
+            "value": 12.61356858195317,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:54:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:27:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.866962514121967,
+            "value": 11.88905354923614,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:54:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:27:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6361.116683499972,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 254069.14275899975,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:40:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:14:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 110.36964258666406,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 240695.88777094072,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:40:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:14:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.05765199994585,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 240912.15783250003,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:40:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:14:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.20724675453855,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.65105717837609,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:40:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:14:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.91689382031487,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.43780510097852,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:40:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:14:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5338.608900499821,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6011.775941000053,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 05:52:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:20:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 183.17088898930766,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 128.2050380700025,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 05:52:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:20:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 175.92986100044072,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 88.94614950008872,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 05:52:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:20:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.62952981019738,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.3468362321537,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 05:52:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:20:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.958026242584666,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.33262726330132,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 05:52:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:20:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6099.107161499887,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:10:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2410.191408999708,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 105.43499850663466,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 115.48227018665541,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:10:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.7180684999239,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.24950499995975,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:10:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.54700459908418,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.93705259664011,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:10:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.045846241846135,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.160367230376785,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:10:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 63258.93250450008,
+            "value": 63109.12443750021,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:03:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:36:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 27231.074444095368,
+            "value": 27099.457985118017,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:03:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:36:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 25326.550558000235,
+            "value": 24794.5113629994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:03:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:36:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 186.14266411794483,
+            "value": 186.17630149679047,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:03:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:36:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 197.1091215325504,
+            "value": 197.59613802394037,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:03:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:36:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2029.6977950006294,
+            "value": 2033.1724739999117,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:14:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:47:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.6466327933352,
+            "value": 81.60828334669229,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:14:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:47:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42.70703700012746,
+            "value": 39.80177150015152,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:14:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:47:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.599106473887705,
+            "value": 11.668408356669811,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:14:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:47:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.677298393022797,
+            "value": 11.696979096307233,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:14:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:47:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2394.628295000075,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5098.723483000867,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:00:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:44:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 116.39873904799848,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 90.27253446010339,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:00:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:44:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.23161299999992,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60.85428400001547,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:00:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:44:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.90703764958482,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.22974063171822,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:00:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:44:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.11724712321176,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30.82210335211133,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:00:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:44:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 14672.161679499823,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1938.1165569998302,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:33:23 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:21:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 213.41412069200305,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.63462712665812,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:33:23 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:21:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 174.149260500144,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.184574000249995,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:33:23 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:21:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 117.7828334274791,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.062266717402675,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:33:23 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:21:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 123.60739112711852,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.160580846799421,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:33:23 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:21:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5051.6441065001345,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13524.57672249966,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:11:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:40:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.2713155733339,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 198.33103487866478,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:11:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:40:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 63.221417499335075,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 156.28295849955975,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:11:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:40:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.14609481401333,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 106.35815688233247,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:11:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:40:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30.811671150266346,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 111.10263256790182,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:11:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:40:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7640.926798500004,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5304.019143500227,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:53:01 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 155.8186790013333,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 181.5558509067002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:53:01 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 129.10266649998903,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 173.9265964997685,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:53:01 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56.98906212683416,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.72565584426561,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:53:01 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56.57185544198197,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.897744546202475,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:53:01 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1921.5456125002675,
+            "value": 1921.0260259997085,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:20:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:54:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.37519164667174,
+            "value": 95.76962023666056,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:20:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:54:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.093453000059526,
+            "value": 58.09594550009933,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:20:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:54:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.16933657139751,
+            "value": 13.251560110156246,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:20:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:54:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.475328412495637,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:20:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15725.322180499461,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:27:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 228.28753929002778,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:27:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 147.56340050007566,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:27:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 145.2820009003947,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:27:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 133.30119400852809,
+            "value": 12.549780864286399,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:27:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:54:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5888.8233325005785,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 76498.90343950005,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:20:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:00:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 118.89838669599703,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56876.460692046676,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:20:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:00:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 86.66479900057311,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69992.24383849991,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:20:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:00:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 53.12787558749585,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.16335240767877,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:20:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:00:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 46.272510246963996,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.58964462050396,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-16 06:20:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:00:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2589.057005000086,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7698.409701499941,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:26:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:27:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.58779895730913,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 156.52133288799632,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:26:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:27:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.76591749993167,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 127.80558999997993,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:26:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:27:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.13812961430733,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.29115957646765,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:26:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:27:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.174284445696333,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.01315399741855,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 05:26:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:27:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6056.139924999969,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 14792.707703500128,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:46:28 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 04:06:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 128.55583270999773,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 218.14901325201268,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:46:28 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 04:06:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 90.00014349999219,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 172.01783900009104,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:46:28 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 04:06:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.448909689841884,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.7628959969548,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:46:28 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 04:06:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.259250252047636,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 125.36463364552219,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 03:46:28 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 04:06:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6027.659153499826,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2575.180585000453,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:17:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 04:00:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.16549665333878,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.64534286131433,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:17:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 04:00:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 77.90766700009044,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.11354049985675,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:17:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 04:00:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.63301758757746,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.257292271003042,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:17:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 04:00:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.29818135093642,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.321459875944225,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:17:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 04:00:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 75949.34716449984,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5891.935765999733,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:26:28 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:53:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56493.18321569867,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.47207699864036,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:26:28 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:53:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69423.33838600008,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 88.99214000030042,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:26:28 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:53:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.62640358472382,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 53.53983529779377,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:26:28 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:53:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.55388043976367,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 46.422655616102965,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:26:28 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:53:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1937.9856095001742,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6054.575040000145,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:48:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:51:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.23790262007363,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.50609336336493,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:48:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:51:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 43.146204000549915,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.48204099989198,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:48:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:51:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.083233474550786,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.69021046032865,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:48:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:51:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.149322394807717,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.28364630012998,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-16 04:48:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:51:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
       {
         "commit": {
           "author": {
-            "name": "Michael Goin",
-            "username": "mgoin",
-            "email": "michael@neuralmagic.com"
+            "name": "Andy Linfoot",
+            "username": "andy-neuma",
+            "email": "78757007+andy-neuma@users.noreply.github.com"
           },
           "committer": {
             "name": "GitHub",
             "username": "web-flow",
             "email": "noreply@github.com"
           },
-          "id": "59cf939c70173f2419d143a738505180c37465a4",
-          "message": "Add FP8 and marlin 2:4 tests for lm-eval (#244)",
-          "timestamp": "2024-05-16T20:43:49Z",
-          "url": "https://github.com/neuralmagic/nm-vllm/commit/59cf939c70173f2419d143a738505180c37465a4"
+          "id": "d69a34a3194efb8dd34c1f293af91aac19b5b992",
+          "message": "updates for nm-magic-wand, nightly or release (#247)\n\nSUMMARY:\r\n* update GHA action `nm-build-vllm` to not install\r\n`nm-magic-wand-nightly`\r\n* update build script to not install `nm-magic-wand-nightly` (we might\r\nconsider getting rid of this script altogether, since we aren't really\r\nusing it)\r\n* remove unused GHA action `nm-test-vllm`. this has been superseded by\r\n`nm-install-test-whl`\r\n* update GHA action `nm-install-test-whl` to get version of\r\n`nm-magic-wand` if `nm-magic-wand-nightly` is not present\r\n* update `setup.py` to default generate \"nightly\" package and add option\r\nbased on ENV to generate release package. this also includes managing\r\nthe dependency on `nm-magic-wand`.\r\n* update `set-env` action to set ENV based on `wf_category` input\r\n* update \"release\" workflow to include all supported python versions\r\n* delete obsolete \"gen-whl\"\r\n\r\nNOTES:\r\n- \"magic-wand\" is only a runtime dependency, so no need to install it\r\nduring build phase.\r\n- this PR makes it so that we by default generate a \"nightly\" package\r\nwith a \"nightly\" version number. if we want to generate a release\r\npackage we'll need to specify `wf_category` as `RELEASE`.\r\n\r\nTEST PLAN:\r\nruns on remote push. verifying that `wf_category` set to `RELEASE` will\r\ngenerate appropriate package.\r\n\r\nran \"build\" workflow with `wf_category` set to `RELEASE` ... package\r\nlooks properly named and versioned ...\r\nhttps://github.com/neuralmagic/nm-vllm/actions/runs/9129675592\r\n\r\nthe \"remote push\" defaulted to generating a \"nightly\" package ... please\r\nsee ... https://github.com/neuralmagic/nm-vllm/actions/runs/9129665988\r\n\r\n---------\r\n\r\nCo-authored-by: andy-neuma <andy@neuralmagic.com>",
+          "timestamp": "2024-05-17T18:21:50Z",
+          "url": "https://github.com/neuralmagic/nm-vllm/commit/d69a34a3194efb8dd34c1f293af91aac19b5b992"
         },
-        "date": 1715931046658,
+        "date": 1716017339879,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6168.269709499782,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 14810.016069499852,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:44:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 04:07:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 105.98510685333773,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 218.56008913000855,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:44:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 04:07:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.63699749987973,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 173.22120100016036,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:44:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 04:07:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.674676269909696,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.2988079861909,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:44:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 04:07:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.19037892508264,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 125.42463874072129,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:44:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 04:07:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15854.127336500824,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6047.7593895000155,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 05:00:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:52:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 229.12089890001153,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.70249681000102,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 05:00:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:52:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 155.03108400025667,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.60992200002693,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 05:00:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:52:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 145.8418386060145,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.64234790472506,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 05:00:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:52:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 132.76484605535293,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.30521584813457,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 05:00:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:52:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3638.4393895000358,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13637.514877500507,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:16:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:40:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.43216443332494,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 199.83664511266275,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:16:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:40:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.1189015000673,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.96504800027833,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:16:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:40:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.13681476256774,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 106.6839072689676,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:16:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:40:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.569469998782356,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 111.64681703876947,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:16:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:40:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59962.44221149993,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6361.296463499968,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:35:32 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:14:37 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37740.73947849,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 109.71057482666917,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:35:32 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:14:37 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37190.24467849999,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.16868700000123,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:35:32 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:14:37 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 97.62116391947876,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.19780760871951,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:35:32 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:14:37 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 99.52574322834293,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.92559944563227,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:35:32 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:14:37 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6363.175155999954,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59489.59240800002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:14:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:35:42 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 109.46530145999759,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37368.98345445866,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:14:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:35:42 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.22012699998459,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37067.69261349996,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:14:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:35:42 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.18218308004019,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 97.26940382136384,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:14:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:35:42 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.90844879031058,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 99.56675985177498,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:14:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:35:42 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1829.1798820000622,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 62056.8363225002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:27:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:36:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.8089847133136,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 26855.629502783344,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:27:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:36:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55.889288499656686,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 25377.36457000028,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:27:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:36:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.61356858195317,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 185.2058512415153,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:27:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:36:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.88905354923614,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 196.33672283094916,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:27:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:36:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 254069.14275899975,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2027.6393539998026,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:14:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:47:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 240695.88777094072,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.96425516666082,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:14:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:47:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 240912.15783250003,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.60456700017312,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:14:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:47:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.65105717837609,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.566972347454207,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:14:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:47:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.43780510097852,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.650799858896228,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:14:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:47:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6011.775941000053,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5088.906428999508,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:20:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:45:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 128.2050380700025,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.13272269999531,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:20:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:45:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 88.94614950008872,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.99242449990561,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:20:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:45:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.3468362321537,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.22368878213732,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:20:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:45:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.33262726330132,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30.838412958535734,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:20:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:45:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2410.191408999708,
+            "value": 2443.2510500000717,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:34:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 115.48227018665541,
+            "value": 115.30383123866584,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:34:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.24950499995975,
+            "value": 80.4688204998456,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:34:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.93705259664011,
+            "value": 18.031387206200954,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:34:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.160367230376785,
+            "value": 16.145742038947475,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:34:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 63109.12443750021,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 254335.7819769999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:36:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:15:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 27099.457985118017,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 239884.11322115734,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:36:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:15:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24794.5113629994,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 241095.04781499982,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:36:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:15:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 186.17630149679047,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.82444884384628,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:36:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:15:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 197.59613802394037,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.901508156746,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:36:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:15:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2033.1724739999117,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6105.448188500077,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:47:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:44:43 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.60828334669229,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 105.28023722000701,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:47:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:44:43 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.80177150015152,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.17037599987452,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:47:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:44:43 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.668408356669811,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.56989417257731,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:47:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:44:43 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.696979096307233,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.099051351825665,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:47:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:44:43 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5098.723483000867,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3638.254859999961,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:44:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:16:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 90.27253446010339,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 142.63412806328537,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:44:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:16:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60.85428400001547,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.39277999970363,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:44:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:16:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.22974063171822,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.245417164670172,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:44:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:16:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30.82210335211133,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.581465649564212,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:44:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:16:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1938.1165569998302,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15878.910925000127,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:21:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 05:00:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.63462712665812,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 257.79683584202337,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:21:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 05:00:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.184574000249995,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 158.93650999987585,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:21:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 05:00:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.062266717402675,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 146.3690193086413,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:21:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 05:00:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.160580846799421,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 134.16699667579783,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:21:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 05:00:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13524.57672249966,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5263.1742175008185,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:40:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:26:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 198.33103487866478,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 182.06313942133661,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:40:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:26:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 156.28295849955975,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 171.37223499958054,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:40:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:26:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 106.35815688233247,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.361488833984794,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:40:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:26:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 111.10263256790182,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.77556198029057,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:40:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:26:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5304.019143500227,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2584.6181110000543,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 04:00:28 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 181.5558509067002,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.38421627068116,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 04:00:28 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 173.9265964997685,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.61264300037874,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 04:00:28 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.72565584426561,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.207850582925126,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 04:00:28 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.897744546202475,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.259956566071594,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 04:00:28 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1921.0260259997085,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1824.3145765000008,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:54:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:28:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.76962023666056,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.51873117000287,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:54:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:28:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.09594550009933,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56.77562600021702,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:54:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:28:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.251560110156246,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.61187830148239,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:54:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:28:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.549780864286399,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.88771671636012,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:54:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:28:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 76498.90343950005,
+            "value": 76891.4737555001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:00:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:00:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56876.460692046676,
+            "value": 57436.928021955995,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:00:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:00:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69992.24383849991,
+            "value": 70402.43751549997,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:00:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:00:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.16335240767877,
+            "value": 66.47794981294955,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:00:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:00:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.58964462050396,
+            "value": 65.12885841910457,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 03:00:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:00:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7698.409701499941,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1922.217876999639,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:27:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:54:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 156.52133288799632,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.57471203667167,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:27:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:54:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 127.80558999997993,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61.548126000161574,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:27:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:54:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.29115957646765,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.195607250386715,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:27:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:54:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.01315399741855,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.492856398193746,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:27:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:54:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 14792.707703500128,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1940.8474389997536,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 04:06:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:22:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 218.14901325201268,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.07058909334714,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 04:06:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:22:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 172.01783900009104,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.778589500452654,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 04:06:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:22:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.7628959969548,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.09075655858916,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 04:06:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:22:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 125.36463364552219,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.185356074012214,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 04:06:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:22:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2575.180585000453,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5820.976581500872,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 04:00:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:53:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.64534286131433,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 118.63150633461433,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 04:00:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:53:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.11354049985675,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 88.11294800034375,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 04:00:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:53:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.257292271003042,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 53.02006540933792,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 04:00:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:53:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.321459875944225,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 46.0881008061641,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 04:00:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:53:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5891.935765999733,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7682.707939000011,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:53:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:27:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.47207699864036,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:53:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 155.30934739999762,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:27:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 88.99214000030042,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 126.49294300001657,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:53:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:27:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 53.53983529779377,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.15492825600079,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:53:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:27:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 46.422655616102965,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56.823160924864695,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-17 04:53:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:27:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6054.575040000145,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6018.421747500042,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:51:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:21:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.50609336336493,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 125.63125606333186,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:51:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:21:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.48204099989198,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.89627549998113,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:51:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:21:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.69021046032865,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.390901221479396,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:51:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:21:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.28364630012998,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.30476443296476,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-17 02:51:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:21:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -73766,668 +73766,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-05-17T18:21:50Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/d69a34a3194efb8dd34c1f293af91aac19b5b992"
         },
-        "date": 1716017339879,
+        "date": 1716104023513,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 14810.016069499852,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61655.41962700081,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 04:07:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:42:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 218.56008913000855,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 26528.746806866013,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 04:07:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:42:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 173.22120100016036,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24690.114835500026,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 04:07:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:42:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.2988079861909,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 184.86608919589457,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 04:07:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:42:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 125.42463874072129,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 196.17474513994708,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 04:07:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:42:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6047.7593895000155,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6044.677653000008,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:52:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:26:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.70249681000102,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 125.03815819000427,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:52:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:26:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.60992200002693,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.75302800009376,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:52:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:26:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.64234790472506,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.424346910297224,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:52:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:26:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.30521584813457,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.21783580931318,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:52:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:26:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13637.514877500507,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1921.6672434999964,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:40:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:59:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 199.83664511266275,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.78780205332501,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:40:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:59:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.96504800027833,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61.91459850015235,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:40:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:59:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 106.6839072689676,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.26898181958198,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:40:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:59:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 111.64681703876947,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.542263680108311,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:40:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:59:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6361.296463499968,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6127.92181750001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:14:37 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:49:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 109.71057482666917,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 105.16290845333363,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:14:37 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:49:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.16868700000123,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.89771300009306,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:14:37 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:49:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.19780760871951,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.56511063377084,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:14:37 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:49:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.92559944563227,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.08658959372397,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:14:37 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:49:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59489.59240800002,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1824.861421499918,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:35:42 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:33:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37368.98345445866,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.41041380330292,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:35:42 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:33:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37067.69261349996,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.37233400052355,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:35:42 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:33:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 97.26940382136384,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.626507907901109,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:35:42 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:33:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 99.56675985177498,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.886236209687796,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:35:42 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:33:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 62056.8363225002,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3635.1900660001775,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:36:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:21:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 26855.629502783344,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143.38315672996941,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:36:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:21:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 25377.36457000028,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.76733099993362,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:36:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:21:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 185.2058512415153,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.280647590190572,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:36:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:21:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 196.33672283094916,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.582746199784975,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:36:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:21:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2027.6393539998026,
+            "value": 2033.8978135005163,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:47:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:52:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.96425516666082,
+            "value": 80.8639452733102,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:47:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:52:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.60456700017312,
+            "value": 39.51642699985314,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:47:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:52:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.566972347454207,
+            "value": 11.673895722857624,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:47:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:52:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.650799858896228,
+            "value": 11.69115451814601,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:47:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:52:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5088.906428999508,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5938.327654000204,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:45:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:58:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.13272269999531,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.85603009857974,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:45:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:58:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.99242449990561,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.8111569995308,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:45:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:58:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.22368878213732,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 53.32193032637963,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:45:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:58:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30.838412958535734,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 46.33472051666922,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:45:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:58:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2443.2510500000717,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5083.483950499613,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:34:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:50:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 115.30383123866584,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 90.46125785995778,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:34:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:50:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.4688204998456,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 63.44288049967872,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:34:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:50:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.031387206200954,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.223061952446045,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:34:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:50:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.145742038947475,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30.817517198938685,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:34:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:50:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 254335.7819769999,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6368.015514000035,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:15:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:19:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 239884.11322115734,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 111.74491108000059,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:15:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:19:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 241095.04781499982,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.00241100001858,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:15:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:19:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.82444884384628,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.20411600305019,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:15:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:19:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.901508156746,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.93004425265935,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:15:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:19:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6105.448188500077,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 14781.194467000205,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:44:43 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 04:12:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 105.28023722000701,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 219.56516076601233,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:44:43 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 04:12:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.17037599987452,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 170.35813100028463,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:44:43 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 04:12:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.56989417257731,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.4530258622076,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:44:43 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 04:12:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.099051351825665,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 124.81759093661918,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:44:43 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 04:12:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3638.254859999961,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16121.083815500242,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:16:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 05:05:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 142.63412806328537,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 275.79642830399223,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:16:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 05:05:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.39277999970363,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 158.28274150044308,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:16:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 05:05:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.245417164670172,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 148.05415411786547,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:16:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 05:05:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.581465649564212,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 136.22412523832307,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:16:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 05:05:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15878.910925000127,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2584.5175194999683,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 05:00:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 04:05:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 257.79683584202337,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.96530007466815,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 05:00:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 04:05:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 158.93650999987585,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.06174400038435,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 05:00:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 04:05:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 146.3690193086413,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.208495283912516,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 05:00:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 04:05:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 134.16699667579783,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.280756653462735,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 05:00:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 04:05:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5263.1742175008185,
+            "value": 5309.328878999622,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:26:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:31:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 182.06313942133661,
+            "value": 182.55137791065258,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:26:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:31:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 171.37223499958054,
+            "value": 174.64112149991706,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:26:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:31:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.361488833984794,
+            "value": 39.58673908159116,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:26:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:31:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.77556198029057,
+            "value": 36.90731126948433,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:26:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:31:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2584.6181110000543,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7728.100629999972,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 04:00:28 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:32:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.38421627068116,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 155.62900196932907,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 04:00:28 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:32:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.61264300037874,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 131.0928509999485,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 04:00:28 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:32:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.207850582925126,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56.93319605515613,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 04:00:28 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:32:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.259956566071594,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56.58694876708374,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 04:00:28 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:32:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1824.3145765000008,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 76756.42494399994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:28:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:05:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.51873117000287,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57101.352517459985,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:28:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:05:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56.77562600021702,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69993.83625249992,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:28:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:05:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.61187830148239,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.00693326917761,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:28:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:05:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.88771671636012,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:28:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.64969922823357,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:05:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 76891.4737555001,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13629.244735499924,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:00:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:45:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57436.928021955995,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 205.90700912799548,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:00:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:45:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70402.43751549997,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 158.40073799972743,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:00:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:45:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.47794981294955,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 106.67080802803109,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:00:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:45:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.12885841910457,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 111.6921246030719,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:00:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:45:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1922.217876999639,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 255002.6453329999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:54:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:20:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.57471203667167,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 240243.91990748668,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:54:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:20:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 61.548126000161574,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 241767.39791549972,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:54:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:20:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.195607250386715,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.7344433797077,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:54:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:20:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.492856398193746,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.95666295226302,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:54:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:20:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1940.8474389997536,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2445.613765499729,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:22:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:39:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.07058909334714,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 116.16249440534011,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:22:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:39:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.778589500452654,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.27358949989139,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:22:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:39:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.09075655858916,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.96014548999124,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:22:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:39:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.185356074012214,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.162734591422716,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 03:22:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:39:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5820.976581500872,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1941.047039500063,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:53:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:27:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 118.63150633461433,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.00893331999518,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:53:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:27:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 88.11294800034375,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.485431000230164,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:53:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:27:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 53.02006540933792,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.129155345630748,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:53:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:27:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 46.0881008061641,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.12759747485139,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-18 04:53:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:27:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7682.707939000011,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6071.857499000089,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:27:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:57:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 155.30934739999762,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.62412981668572,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:27:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:57:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 126.49294300001657,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.97832849989572,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:27:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:57:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.15492825600079,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.80621436769108,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:27:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:57:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56.823160924864695,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.397060414084116,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:27:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:57:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6018.421747500042,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59540.021137000054,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:21:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:40:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 125.63125606333186,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37168.83478003866,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:21:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:40:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.89627549998113,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36646.56828200009,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:21:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:40:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.390901221479396,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 97.34496673627908,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:21:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:40:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.30476443296476,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 99.2032687169866,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-18 02:21:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:40:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -74443,673 +74443,673 @@ window.BENCHMARK_DATA = {
             "username": "web-flow",
             "email": "noreply@github.com"
           },
-          "id": "d69a34a3194efb8dd34c1f293af91aac19b5b992",
-          "message": "updates for nm-magic-wand, nightly or release (#247)\n\nSUMMARY:\r\n* update GHA action `nm-build-vllm` to not install\r\n`nm-magic-wand-nightly`\r\n* update build script to not install `nm-magic-wand-nightly` (we might\r\nconsider getting rid of this script altogether, since we aren't really\r\nusing it)\r\n* remove unused GHA action `nm-test-vllm`. this has been superseded by\r\n`nm-install-test-whl`\r\n* update GHA action `nm-install-test-whl` to get version of\r\n`nm-magic-wand` if `nm-magic-wand-nightly` is not present\r\n* update `setup.py` to default generate \"nightly\" package and add option\r\nbased on ENV to generate release package. this also includes managing\r\nthe dependency on `nm-magic-wand`.\r\n* update `set-env` action to set ENV based on `wf_category` input\r\n* update \"release\" workflow to include all supported python versions\r\n* delete obsolete \"gen-whl\"\r\n\r\nNOTES:\r\n- \"magic-wand\" is only a runtime dependency, so no need to install it\r\nduring build phase.\r\n- this PR makes it so that we by default generate a \"nightly\" package\r\nwith a \"nightly\" version number. if we want to generate a release\r\npackage we'll need to specify `wf_category` as `RELEASE`.\r\n\r\nTEST PLAN:\r\nruns on remote push. verifying that `wf_category` set to `RELEASE` will\r\ngenerate appropriate package.\r\n\r\nran \"build\" workflow with `wf_category` set to `RELEASE` ... package\r\nlooks properly named and versioned ...\r\nhttps://github.com/neuralmagic/nm-vllm/actions/runs/9129675592\r\n\r\nthe \"remote push\" defaulted to generating a \"nightly\" package ... please\r\nsee ... https://github.com/neuralmagic/nm-vllm/actions/runs/9129665988\r\n\r\n---------\r\n\r\nCo-authored-by: andy-neuma <andy@neuralmagic.com>",
-          "timestamp": "2024-05-17T18:21:50Z",
-          "url": "https://github.com/neuralmagic/nm-vllm/commit/d69a34a3194efb8dd34c1f293af91aac19b5b992"
+          "id": "93183d6b21fd1f42fc2a98b93e46fca0c5530b40",
+          "message": "increase timeouts (#253)\n\nSUMMARY:\r\n* increase \"RELEASE\" workflow timeouts to 12 hours\r\n\r\nTEST PLAN:\r\nwill cherry pick and trigger job manually on release, `v0.3.0`\r\n\r\nCo-authored-by: andy-neuma <andy@neuralmagic.com>",
+          "timestamp": "2024-05-20T21:44:29Z",
+          "url": "https://github.com/neuralmagic/nm-vllm/commit/93183d6b21fd1f42fc2a98b93e46fca0c5530b40"
         },
-        "date": 1716104023513,
+        "date": 1716276873695,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 61655.41962700081,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5888.232021000476,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:42:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:58:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 26528.746806866013,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.28707645070001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:42:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:58:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24690.114835500026,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.33425299933151,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:42:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:58:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 184.86608919589457,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 53.37600865038578,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:42:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:58:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 196.17474513994708,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 46.240458468117346,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:42:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:58:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6044.677653000008,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1851.7477995001173,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:26:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:34:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 125.03815819000427,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.63186055335609,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:26:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:34:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.75302800009376,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.01871750029386,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:26:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:34:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.424346910297224,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.64062083609474,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:26:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:34:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.21783580931318,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.908801976978634,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:26:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:34:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1921.6672434999964,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 62561.73622250026,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:59:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:42:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.78780205332501,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 26827.57049859066,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:59:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:42:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 61.91459850015235,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24709.98807849992,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:59:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:42:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.26898181958198,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 186.2852890606116,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:59:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:42:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.542263680108311,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 197.76614333508624,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:59:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:42:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6127.92181750001,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 76591.36254999999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:49:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:02:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 105.16290845333363,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56769.70654236267,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:49:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:02:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.89771300009306,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69590.44582899992,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:49:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:02:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.56511063377084,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.06704888151754,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:49:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:02:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.08658959372397,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.64701087512805,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:49:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:02:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1824.861421499918,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1948.544970999592,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:33:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:28:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.41041380330292,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78.7087402533507,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:33:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:28:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.37233400052355,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.53587200021502,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:33:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:28:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.626507907901109,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.09703730208491,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:33:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:28:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.886236209687796,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.17204182207614,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:33:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:28:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3635.1900660001775,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15923.402555499706,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:21:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 05:06:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.38315672996941,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 245.11491717732738,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:21:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 05:06:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.76733099993362,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 159.51965949989244,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:21:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 05:06:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.280647590190572,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 146.06644579933197,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:21:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 05:06:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.582746199784975,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 133.87246110308337,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:21:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 05:06:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2033.8978135005163,
+            "value": 2035.2116740000383,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:52:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:54:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.8639452733102,
+            "value": 81.47769093337047,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:52:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:54:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.51642699985314,
+            "value": 40.839236000010715,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:52:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:54:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.673895722857624,
+            "value": 11.604968096061906,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:52:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:54:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.69115451814601,
+            "value": 11.710429787194647,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:52:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:54:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5938.327654000204,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6043.739634999952,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:58:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:22:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.85603009857974,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 126.0259374199946,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:58:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:22:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.8111569995308,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 85.7043535000912,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:58:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:22:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 53.32193032637963,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.47326180115146,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:58:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:22:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 46.33472051666922,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.24005488804747,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:58:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:22:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5083.483950499613,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6097.635424999908,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:50:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:46:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 90.46125785995778,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 104.50908731998727,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:50:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:46:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 63.44288049967872,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.17343300005996,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:50:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:46:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.223061952446045,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.57012586129777,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:50:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:46:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30.817517198938685,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.0636363200751,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:50:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:46:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6368.015514000035,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59794.577934000015,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:19:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:36:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 111.74491108000059,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37813.855549205335,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:19:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:36:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.00241100001858,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37292.644641000035,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:19:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:36:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.20411600305019,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 97.51843281166308,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:19:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:36:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.93004425265935,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 99.77894709484394,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:19:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:36:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 14781.194467000205,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5321.007210000062,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 04:12:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:32:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 219.56516076601233,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 182.49545777331514,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 04:12:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:32:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 170.35813100028463,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 173.38897650006402,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 04:12:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:32:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.4530258622076,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.54371866717419,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 04:12:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:32:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 124.81759093661918,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 04:12:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.85150928037011,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:32:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16121.083815500242,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5097.145164000722,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 05:05:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:50:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 275.79642830399223,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.19441409999611,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 05:05:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:50:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 158.28274150044308,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61.55316599961225,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 05:05:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:50:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 148.05415411786547,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.27375442270062,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 05:05:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:50:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 136.22412523832307,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30.899791572255083,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 05:05:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:50:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2584.5175194999683,
+            "value": 2582.9528250001204,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 04:05:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:06:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.96530007466815,
+            "value": 121.40620600400871,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 04:05:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:06:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.06174400038435,
+            "value": 84.30327799987936,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 04:05:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:06:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.208495283912516,
+            "value": 19.327078396834594,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 04:05:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:06:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.280756653462735,
+            "value": 17.348156597758088,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 04:05:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:06:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5309.328878999622,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7676.953527500018,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:31:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:28:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 182.55137791065258,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 155.45015838399922,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:31:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:28:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 174.64112149991706,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 126.44363349988907,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:31:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:28:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.58673908159116,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.14807488799022,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:31:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:28:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.90731126948433,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56.86473735310059,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-19 04:31:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:28:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7728.100629999972,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6375.89961599997,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:32:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:14:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 155.62900196932907,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 111.17996672000194,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:32:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:14:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 131.0928509999485,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.82110750002357,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:32:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:14:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56.93319605515613,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.261575918773644,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:32:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:14:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56.58694876708374,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.93941055041797,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:32:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:14:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 76756.42494399994,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 256178.08394700024,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:05:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:21:10 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57101.352517459985,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 241217.3788715066,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:05:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:21:10 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69993.83625249992,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 242909.12422749988,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:05:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:21:10 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.00693326917761,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.78426943271543,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:05:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:21:10 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.64969922823357,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.42910055791909,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:05:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:21:10 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13629.244735499924,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 14962.165949000337,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:45:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:12:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 205.90700912799548,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 217.35480540932926,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:45:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:12:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 158.40073799972743,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 174.4372419998399,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:45:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:12:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 106.67080802803109,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.31899528070943,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:45:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:12:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 111.6921246030719,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 126.0186063754553,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:45:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:12:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 255002.6453329999,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2412.8427380001085,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:20:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:40:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 240243.91990748668,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 116.05806883201392,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:20:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:40:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 241767.39791549972,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.62716600008935,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:20:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:40:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.7344433797077,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.967667828441932,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:20:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:40:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.95666295226302,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.19396913740613,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:20:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:40:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2445.613765499729,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3648.4311744998195,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:39:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:22:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 116.16249440534011,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 142.61817750334254,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:39:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:22:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.27358949989139,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.64381549989776,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:39:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:22:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.96014548999124,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.391557942030953,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:39:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:22:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.162734591422716,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.682450065742795,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:39:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:22:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1941.047039500063,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1931.3982129997385,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:27:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:00:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.00893331999518,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.64826490332962,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:27:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:00:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.485431000230164,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61.139810999520705,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:27:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:00:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.129155345630748,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.212292750231466,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:27:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:00:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.12759747485139,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.533745229745174,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 03:27:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:00:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6071.857499000089,
+            "value": 6043.03864350004,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:57:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:53:48 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.62412981668572,
+            "value": 121.09042400334602,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:57:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:53:48 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.97832849989572,
+            "value": 81.08035050008766,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:57:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:53:48 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.80621436769108,
+            "value": 39.646203415611396,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:57:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:53:48 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.397060414084116,
+            "value": 39.31455497377305,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:57:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:53:48 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59540.021137000054,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13850.136857500274,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:40:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:46:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37168.83478003866,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 203.4305926473353,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:40:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:46:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36646.56828200009,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 160.73287000017444,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:40:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:46:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 97.34496673627908,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 108.9987464124662,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:40:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:46:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 99.2032687169866,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 113.73618585393356,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-19 02:40:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:46:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -75130,668 +75130,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-05-20T21:44:29Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/93183d6b21fd1f42fc2a98b93e46fca0c5530b40"
         },
-        "date": 1716276873695,
+        "date": 1716363120528,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5888.232021000476,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 256771.280315,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:58:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:18:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.28707645070001,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 241967.81838042397,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:58:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:18:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.33425299933151,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 243518.75813349965,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:58:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:18:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 53.37600865038578,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.92201654990878,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:58:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:18:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 46.240458468117346,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.57899807596435,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:58:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:18:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1851.7477995001173,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15245.16342600009,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:34:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 04:09:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.63186055335609,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 216.33090558799086,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:34:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 04:09:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.01871750029386,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 170.93354750022627,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:34:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 04:09:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.64062083609474,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 123.57023909569018,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:34:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 04:09:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.908801976978634,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 128.89090979804146,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:34:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 04:09:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 62561.73622250026,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 76459.87004700009,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:42:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:03:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 26827.57049859066,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57006.08940392401,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:42:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:03:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24709.98807849992,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69956.17225149977,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:42:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:03:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 186.2852890606116,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.29421221482295,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:42:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:03:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 197.76614333508624,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.44521783417453,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:42:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:03:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 76591.36254999999,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16063.256238000577,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:02:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 05:03:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56769.70654236267,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 285.1757610933564,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:02:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 05:03:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69590.44582899992,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 163.56607699981396,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:02:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 05:03:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.06704888151754,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 148.20962904427824,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:02:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 05:03:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.64701087512805,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 135.77424767327517,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:02:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 05:03:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1948.544970999592,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2590.1457790000677,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:28:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 04:02:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78.7087402533507,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 122.22204088800088,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:28:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 04:02:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.53587200021502,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.77577700023176,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:28:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 04:02:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.09703730208491,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.27610465963713,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:28:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 04:02:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.17204182207614,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.255476069253277,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:28:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 04:02:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15923.402555499706,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3649.4989170000736,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 05:06:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:19:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 245.11491717732738,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 142.08588893001132,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 05:06:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:19:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 159.51965949989244,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.60186949970739,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 05:06:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:19:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 146.06644579933197,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.37970848197325,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 05:06:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:19:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 133.87246110308337,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.666738033050542,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 05:06:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:19:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2035.2116740000383,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5305.091374999392,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:54:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:29:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.47769093337047,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 182.49676534935378,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:54:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:29:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.839236000010715,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:54:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 173.33045750001475,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:29:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.604968096061906,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.56192385121597,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:54:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:29:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.710429787194647,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.86431237682397,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:54:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:29:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6043.739634999952,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1938.3338355000888,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:22:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:25:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 126.0259374199946,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.67521980001038,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:22:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:25:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 85.7043535000912,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.12038950031638,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:22:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:25:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.47326180115146,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.079684189198728,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:22:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:25:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.24005488804747,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.16000790993628,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:22:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:25:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6097.635424999908,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6054.55432500014,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:46:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:23:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 104.50908731998727,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 128.47732568000956,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:46:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:23:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.17343300005996,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 90.02754549999281,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:46:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:23:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.57012586129777,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.41897263375693,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:46:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:23:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.0636363200751,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.26074101871709,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:46:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:23:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59794.577934000015,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1921.9436020002831,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:36:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:56:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37813.855549205335,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 96.61242698001236,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:36:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:56:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37292.644641000035,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61.55323299981319,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:36:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:56:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 97.51843281166308,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.216317727657616,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:36:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:56:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 99.77894709484394,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.480939822172344,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:36:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:56:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5321.007210000062,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1826.2239115001648,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:32:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:30:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 182.49545777331514,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.34109191331663,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:32:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:30:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 173.38897650006402,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55.82000150025124,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:32:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:30:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.54371866717419,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.62939598497101,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:32:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:30:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.85150928037011,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.92246008794239,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:32:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:30:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5097.145164000722,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2410.6342294999195,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:50:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:37:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.19441409999611,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 116.91471860398451,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:50:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:37:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 61.55316599961225,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.25686200012206,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:50:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:37:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.27375442270062,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.08641513766197,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:50:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:37:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30.899791572255083,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.237355544863274,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:50:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:37:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2582.9528250001204,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6061.561180500121,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:06:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:54:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.40620600400871,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.74932165665511,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:06:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:54:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.30327799987936,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 75.64902700005405,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:06:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:54:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.327078396834594,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.64483831073297,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:06:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:54:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.348156597758088,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.25501033452916,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:06:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:54:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7676.953527500018,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6371.696383000028,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:28:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:17:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 155.45015838399922,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 110.48925968000124,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:28:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:17:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 126.44363349988907,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.49467699996603,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:28:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:17:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.14807488799022,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.201560438764346,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:28:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:17:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56.86473735310059,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.90810849399123,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:28:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:17:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6375.89961599997,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5821.214592499928,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:14:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:56:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 111.17996672000194,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 118.52209673600494,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:14:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:56:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.82110750002357,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 90.16225599953032,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:14:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:56:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.261575918773644,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 53.09503225674646,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:14:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:56:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.93941055041797,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 46.1659372870581,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:14:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:56:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 256178.08394700024,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 62754.508411499046,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:21:10 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:39:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 241217.3788715066,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 27318.900191153323,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:21:10 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:39:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 242909.12422749988,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 25762.201995500618,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:21:10 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:39:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.78426943271543,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 185.7308705063005,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:21:10 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:39:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.42910055791909,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 196.39547295298112,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:21:10 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:39:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 14962.165949000337,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2027.2287369994046,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:12:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:50:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 217.35480540932926,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.79010776007392,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:12:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:50:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 174.4372419998399,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.287139500378544,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:12:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:50:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.31899528070943,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.60009808279639,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:12:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:50:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 126.0186063754553,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.687424613845359,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:12:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:50:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2412.8427380001085,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5068.800384000497,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:40:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:47:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 116.05806883201392,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.11502234658353,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:40:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:47:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.62716600008935,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 62.95106050038157,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:40:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:47:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.967667828441932,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.22859554011948,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:40:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:47:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.19396913740613,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30.890091700003964,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:40:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:47:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3648.4311744998195,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13774.644650499795,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:22:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:43:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 142.61817750334254,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 206.66789126134367,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:22:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:43:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.64381549989776,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 160.7057569999597,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:22:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:43:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.391557942030953,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 109.51795528662116,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:22:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:43:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.682450065742795,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 113.96346420574625,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-21 04:22:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:43:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1931.3982129997385,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6115.162760999965,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:00:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:47:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.64826490332962,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 104.56159974003337,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:00:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:47:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 61.139810999520705,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.11591900013991,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:00:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:47:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.212292750231466,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.617765827755456,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:00:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:47:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.533745229745174,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.101717808082036,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 04:00:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:47:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6043.03864350004,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7772.880707999889,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:53:48 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:30:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.09042400334602,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 156.48812977466272,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:53:48 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:30:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.08035050008766,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 133.6518690000048,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:53:48 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:30:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.646203415611396,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.264786195731034,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:53:48 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:30:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.31455497377305,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56.87179617812733,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 02:53:48 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:30:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13850.136857500274,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60296.60475000003,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:46:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:38:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 203.4305926473353,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38417.961550126,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:46:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:38:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 160.73287000017444,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38246.879616499995,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:46:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:38:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 108.9987464124662,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 98.11643894172259,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:46:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:38:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 113.73618585393356,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 100.4101346244016,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-21 03:46:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:38:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -75807,1355 +75807,1355 @@ window.BENCHMARK_DATA = {
             "username": "web-flow",
             "email": "noreply@github.com"
           },
-          "id": "93183d6b21fd1f42fc2a98b93e46fca0c5530b40",
-          "message": "increase timeouts (#253)\n\nSUMMARY:\r\n* increase \"RELEASE\" workflow timeouts to 12 hours\r\n\r\nTEST PLAN:\r\nwill cherry pick and trigger job manually on release, `v0.3.0`\r\n\r\nCo-authored-by: andy-neuma <andy@neuralmagic.com>",
-          "timestamp": "2024-05-20T21:44:29Z",
-          "url": "https://github.com/neuralmagic/nm-vllm/commit/93183d6b21fd1f42fc2a98b93e46fca0c5530b40"
+          "id": "a10b8316b838ebf315aafc4e0429e1399eca3307",
+          "message": "`requirements-dev.txt` and workflow patches (#255)\n\nSUMMARY:\r\n* update `requirements-dev.txt` to address `urllib` dependency\r\n* update \"build test\" only be reusable\r\n* update \"benchmark\" workflow to disambiguate artifacts\r\n\r\nTEST PLAN:\r\nruns on remote push\r\n\r\n---------\r\n\r\nCo-authored-by: Michael Goin <michael@neuralmagic.com>\r\nCo-authored-by: dhuangnm <74931910+dhuangnm@users.noreply.github.com>\r\nCo-authored-by: andy-neuma <andy@neuralmagic.com>\r\nCo-authored-by: Domenic Barbuzzi <dbarbuzzi@gmail.com>",
+          "timestamp": "2024-05-22T13:44:43Z",
+          "url": "https://github.com/neuralmagic/nm-vllm/commit/a10b8316b838ebf315aafc4e0429e1399eca3307"
         },
-        "date": 1716363120528,
+        "date": 1716449679848,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 256771.280315,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7689.986569999974,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:18:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:32:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 241967.81838042397,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 155.2369954986707,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:18:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:32:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 243518.75813349965,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 127.68052250009987,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:18:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:32:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.92201654990878,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.18177578822051,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:18:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:32:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.57899807596435,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56.90111482577333,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:18:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:32:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15245.16342600009,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6355.970685499983,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 04:09:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:19:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 216.33090558799086,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 111.16883072000064,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 04:09:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:19:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 170.93354750022627,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.46714699993117,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 04:09:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:19:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 123.57023909569018,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.17667501279266,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 04:09:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:19:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 128.89090979804146,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.897573278567734,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 04:09:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:19:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 76459.87004700009,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13543.124993999754,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:03:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:45:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57006.08940392401,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 198.0917725726455,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:03:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:45:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69956.17225149977,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.69420799961154,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:03:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:45:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.29421221482295,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 106.5214924319586,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:03:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:45:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.44521783417453,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 111.27711537817672,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:03:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:45:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16063.256238000577,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 14884.52258549978,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 05:03:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 04:12:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 285.1757610933564,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 215.83828740533318,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 05:03:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 04:12:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 163.56607699981396,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 170.05572449988904,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 05:03:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 04:12:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 148.20962904427824,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.93837186669334,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 05:03:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 04:12:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 135.77424767327517,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 125.709955622744,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 05:03:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 04:12:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2590.1457790000677,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59568.68839799995,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 04:02:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:40:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 122.22204088800088,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37536.996530118005,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 04:02:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:40:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.77577700023176,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36862.34628149998,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 04:02:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:40:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.27610465963713,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 97.34058150278607,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 04:02:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:40:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.255476069253277,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 99.78903259935451,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 04:02:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:40:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3649.4989170000736,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 76216.40349400013,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:19:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:05:29 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 142.08588893001132,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56605.960194843996,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:19:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:05:29 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.60186949970739,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69334.1744034999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:19:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:05:29 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.37970848197325,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.27457006131601,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:19:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:05:29 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.666738033050542,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.15658977573332,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:19:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:05:29 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5305.091374999392,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2413.9468949997536,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:29:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:39:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 182.49676534935378,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 115.32430701464425,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:29:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:39:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 173.33045750001475,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.14013299993894,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:29:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:39:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.56192385121597,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.99891797558831,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:29:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:39:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.86431237682397,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.15897576666761,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:29:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:39:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1938.3338355000888,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5087.44845199999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:25:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:50:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.67521980001038,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 88.80579291998099,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:25:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:50:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.12038950031638,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 63.061083001230145,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:25:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:50:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.079684189198728,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.240965857212494,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:25:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:50:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.16000790993628,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30.85187867530749,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:25:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:50:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6054.55432500014,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1943.3179324996672,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:23:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:27:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 128.47732568000956,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.02932675337676,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:23:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:27:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 90.02754549999281,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.14830800022173,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:23:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:27:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.41897263375693,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.069767070678836,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:23:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:27:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.26074101871709,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.176234212828936,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:23:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:27:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1921.9436020002831,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6100.955769499933,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:56:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:49:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 96.61242698001236,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 104.57615327331648,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:56:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:49:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 61.55323299981319,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.58089699979064,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:56:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:49:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.216317727657616,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.58406245663734,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:56:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:49:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.480939822172344,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.07420007437409,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:56:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:49:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1826.2239115001648,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5270.538775500427,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:30:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:31:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.34109191331663,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 181.91739862662266,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:30:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:31:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55.82000150025124,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 173.0050875003144,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:30:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:31:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.62939598497101,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.47199793280612,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:30:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:31:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.92246008794239,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.66351316804006,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:30:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:31:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2410.6342294999195,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3635.538525999891,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:37:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:21:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 116.91471860398451,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143.14882953669135,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:37:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:21:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.25686200012206,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.81988950008235,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:37:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:21:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.08641513766197,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.17861995539896,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:37:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:21:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.237355544863274,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.656972808143404,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:37:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:21:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6061.561180500121,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5823.356046999834,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:54:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:58:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.74932165665511,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.88542066930434,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:54:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:58:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 75.64902700005405,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.16509849950671,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:54:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:58:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.64483831073297,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 53.13534165651794,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:54:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:58:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.25501033452916,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 46.2948623041938,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:54:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:58:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6371.696383000028,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2552.110619999439,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:17:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 04:05:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 110.48925968000124,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.35040164531893,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:17:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 04:05:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.49467699996603,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.91137899980095,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:17:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 04:05:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.201560438764346,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.003036375024585,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:17:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 04:05:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.90810849399123,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.2926273737124,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:17:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 04:05:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5821.214592499928,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16172.425026000383,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:56:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 05:06:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 118.52209673600494,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 274.12945982464953,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:56:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 05:06:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 90.16225599953032,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 160.27451699937956,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:56:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 05:06:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 53.09503225674646,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 147.62205870226447,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:56:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 05:06:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 46.1659372870581,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 135.15480796223878,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:56:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 05:06:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 62754.508411499046,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1921.1439794999023,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:39:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:59:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 27318.900191153323,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.49463854331407,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:39:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:59:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 25762.201995500618,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.16418050037464,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:39:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:59:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 185.7308705063005,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.252081158818903,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:39:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:59:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 196.39547295298112,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.528253547893332,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:39:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:59:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2027.2287369994046,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 254997.02183650038,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:50:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:20:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.79010776007392,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 240218.253697654,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:50:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:20:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.287139500378544,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 241757.30261750004,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:50:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:20:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.60009808279639,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.71335484664382,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:50:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:20:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.687424613845359,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.96072925992397,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:50:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:20:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5068.800384000497,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6035.140677500067,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:47:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:25:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.11502234658353,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 127.43279252667244,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:47:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:25:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 62.95106050038157,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.55342100000962,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:47:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:25:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.22859554011948,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.28536140570839,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:47:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:25:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30.890091700003964,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.236489911805975,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-22 04:47:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:25:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13774.644650499795,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6032.537427500074,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:43:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:56:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 206.66789126134367,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.9273855433418,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:43:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:56:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 160.7057569999597,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.7403040000263,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:43:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:56:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 109.51795528662116,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.52752941719753,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:43:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:56:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 113.96346420574625,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.282266362386494,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 03:43:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:56:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6115.162760999965,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1822.5031120000494,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:47:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:33:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 104.56159974003337,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.59179489331775,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:47:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:33:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.11591900013991,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.79737500006377,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:47:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:33:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.617765827755456,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.556154617036514,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:47:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:33:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.101717808082036,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.89281882411575,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:47:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:33:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7772.880707999889,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64642.84791799946,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:30:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:42:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 156.48812977466272,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 28202.068138578023,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:30:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:42:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 133.6518690000048,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 26515.03411000067,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:30:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:42:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.264786195731034,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 187.5715645663519,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:30:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:42:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56.87179617812733,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 198.60651942622803,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:30:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:42:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60296.60475000003,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2030.1320085000043,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:38:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:52:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38417.961550126,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.92254640001556,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:38:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:52:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38246.879616499995,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.536833000056504,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:38:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:52:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 98.11643894172259,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.594711202005476,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:38:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:52:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 100.4101346244016,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.670738943688308,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-22 02:38:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:52:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
       {
         "commit": {
           "author": {
-            "name": "Andy Linfoot",
-            "username": "andy-neuma",
-            "email": "78757007+andy-neuma@users.noreply.github.com"
+            "name": "dhuangnm",
+            "username": "dhuangnm",
+            "email": "74931910+dhuangnm@users.noreply.github.com"
           },
           "committer": {
             "name": "GitHub",
             "username": "web-flow",
             "email": "noreply@github.com"
           },
-          "id": "a10b8316b838ebf315aafc4e0429e1399eca3307",
-          "message": "`requirements-dev.txt` and workflow patches (#255)\n\nSUMMARY:\r\n* update `requirements-dev.txt` to address `urllib` dependency\r\n* update \"build test\" only be reusable\r\n* update \"benchmark\" workflow to disambiguate artifacts\r\n\r\nTEST PLAN:\r\nruns on remote push\r\n\r\n---------\r\n\r\nCo-authored-by: Michael Goin <michael@neuralmagic.com>\r\nCo-authored-by: dhuangnm <74931910+dhuangnm@users.noreply.github.com>\r\nCo-authored-by: andy-neuma <andy@neuralmagic.com>\r\nCo-authored-by: Domenic Barbuzzi <dbarbuzzi@gmail.com>",
-          "timestamp": "2024-05-22T13:44:43Z",
-          "url": "https://github.com/neuralmagic/nm-vllm/commit/a10b8316b838ebf315aafc4e0429e1399eca3307"
+          "id": "a6b94433cc411da4e03724d54c5b158a24cfc6b3",
+          "message": "update install commands (#264)\n\nUse nm pypi to install.\r\n\r\n---------\r\n\r\nCo-authored-by: dhuangnm <dhuang@MacBook-Pro-2.local>",
+          "timestamp": "2024-05-23T22:20:55Z",
+          "url": "https://github.com/neuralmagic/nm-vllm/commit/a6b94433cc411da4e03724d54c5b158a24cfc6b3"
         },
-        "date": 1716449679848,
+        "date": 1716536565926,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7689.986569999974,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 53340.341240000045,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:32:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:45:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 155.2369954986707,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 32647.24735656332,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:32:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:45:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 127.68052250009987,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 32484.890060499994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:32:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:45:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.18177578822051,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 94.19022086013526,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:32:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:45:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56.90111482577333,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 96.75279759147365,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:32:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:45:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6355.970685499983,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56350.74879249987,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:19:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:45:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 111.16883072000064,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23603.44053233068,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:19:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:45:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.46714699993117,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 21319.333429500148,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:19:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:45:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.17667501279266,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 180.05547488093498,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:19:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:45:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.897573278567734,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 191.6181682981477,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:19:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:45:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13543.124993999754,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6328.47620550001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:45:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:18:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 198.0917725726455,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 107.63078116000695,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:45:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:18:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.69420799961154,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.90298399999983,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:45:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:18:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 106.5214924319586,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.021877670575236,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:45:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:18:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 111.27711537817672,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.7119793768181,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:45:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:18:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 14884.52258549978,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1927.5843179998446,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 04:12:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:31:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 215.83828740533318,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.19803643329699,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 04:12:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:31:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 170.05572449988904,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.83859500001563,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 04:12:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:31:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.93837186669334,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 10.948486448532625,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 04:12:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:31:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 125.709955622744,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.07130429856349,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 04:12:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:31:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59568.68839799995,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1922.216698000284,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:40:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:03:31 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37536.996530118005,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 94.95481403333845,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:40:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:03:31 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36862.34628149998,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60.12456300004487,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:40:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:03:31 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 97.34058150278607,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.021855861203537,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:40:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:03:31 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 99.78903259935451,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.31126985500981,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:40:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:03:31 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 76216.40349400013,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 10706.843501000094,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:05:29 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:50:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56605.960194843996,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 189.20717229067245,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:05:29 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:50:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69334.1744034999,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 146.12494499988316,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:05:29 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:50:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.27457006131601,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.11347146428858,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:05:29 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:50:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.15658977573332,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.72714163173411,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:05:29 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:50:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2413.9468949997536,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6021.900845499886,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:39:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:01:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 115.32430701464425,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.95130278667511,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:39:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:01:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.14013299993894,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.55972399979328,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:39:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:01:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.99891797558831,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.442295126045764,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:39:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:01:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.15897576666761,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.16522131586314,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:39:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:01:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5087.44845199999,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2501.4737395003976,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:50:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:09:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 88.80579291998099,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.45848303999931,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:50:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:09:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 63.061083001230145,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.39095849987643,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:50:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:09:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.240965857212494,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.586016224269795,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:50:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:09:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30.85187867530749,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.826070068509377,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:50:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:09:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1943.3179324996672,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2018.6891855000795,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:27:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:57:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.02932675337676,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.39448462671376,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:27:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:57:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.14830800022173,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.17358449987296,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:27:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:57:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.069767070678836,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.47898532661828,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:27:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:57:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.176234212828936,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.611666316849625,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:27:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:57:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6100.955769499933,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1795.7626519996666,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:49:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:37:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 104.57615327331648,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.60398087663452,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:49:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:37:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.58089699979064,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.510263999778545,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:49:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:37:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.58406245663734,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.359187912905757,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:49:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:37:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.07420007437409,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.727949840223024,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:49:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:37:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5270.538775500427,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5015.686206999817,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:31:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:54:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 181.91739862662266,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 88.56763556000563,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:31:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:54:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 173.0050875003144,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61.70900599954621,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:31:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:54:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.47199793280612,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.956494874308596,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:31:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:54:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.66351316804006,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30.625132983095348,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:31:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:54:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3635.538525999891,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5964.362075999929,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:21:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:30:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.14882953669135,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 124.80836461333107,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:21:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:30:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.81988950008235,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 87.86407200000212,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:21:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:30:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.17861995539896,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.03774788189492,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:21:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:30:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.656972808143404,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.01274685221,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:21:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:30:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5823.356046999834,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 248859.66981499997,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:58:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:24:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.88542066930434,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 234459.43021540332,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:58:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:24:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.16509849950671,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 235739.0611275,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:58:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:24:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 53.13534165651794,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.13166612947958,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:58:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:24:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 46.2948623041938,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.17691391666763,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:58:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:24:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2552.110619999439,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11799.690713000473,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 04:05:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:16:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.35040164531893,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 200.45587572466198,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 04:05:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:16:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.91137899980095,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 154.61897100021815,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 04:05:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:16:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.003036375024585,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.01456649864781,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 04:05:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:16:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.2926273737124,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.56831145706713,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 04:05:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:16:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16172.425026000383,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2360.9765269998206,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 05:06:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:43:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 274.12945982464953,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 114.06260132800647,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 05:06:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:43:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 160.27451699937956,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78.12535400034903,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 05:06:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:43:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 147.62205870226447,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.382147388704105,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 05:06:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:43:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 135.15480796223878,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15.60249533650649,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 05:06:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:43:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1921.1439794999023,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3616.067349000332,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:59:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:25:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.49463854331407,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 142.33096500331158,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:59:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:25:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.16418050037464,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 96.21500950015616,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:59:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:25:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.252081158818903,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.055657327705525,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:59:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:25:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.528253547893332,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.363944819167397,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:59:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:25:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 254997.02183650038,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5632.193468999503,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:20:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:02:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 240218.253697654,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 115.86905475065336,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:20:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:02:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 241757.30261750004,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 86.12576200084732,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:20:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:02:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.71335484664382,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 51.53382027128091,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:20:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:02:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.96072925992397,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 44.77461915881346,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:20:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:02:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6035.140677500067,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6073.618323000119,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:25:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:54:35 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 127.43279252667244,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 104.26460736663405,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:25:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:54:35 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.55342100000962,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.37401249984032,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:25:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:54:35 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.28536140570839,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.37037158497149,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:25:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:54:35 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.236489911805975,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.94458470579859,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:25:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:54:35 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6032.537427500074,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11876.43002400091,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:56:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:09:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.9273855433418,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 160.14412551399556,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:56:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:09:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.7403040000263,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 128.1723874999443,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:56:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:09:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.52752941719753,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 105.47279808382501,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:56:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:09:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.282266362386494,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 94.25909209936462,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 02:56:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:09:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1822.5031120000494,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 73406.11455199996,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:33:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:10:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.59179489331775,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54311.549439408,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:33:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:10:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.79737500006377,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66464.88848750005,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:33:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:10:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.556154617036514,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.72266576174421,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:33:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:10:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.89281882411575,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 63.966167132927936,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:33:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:10:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64642.84791799946,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7404.249196000023,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:42:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:37:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 28202.068138578023,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 154.10822315466854,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:42:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:37:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 26515.03411000067,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 127.76332000009916,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:42:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:37:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 187.5715645663519,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55.44544366717936,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:42:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:37:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 198.60651942622803,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54.98981760273222,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-23 04:42:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:37:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2030.1320085000043,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5116.336590000174,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:52:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:35:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.92254640001556,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 180.1112781680058,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:52:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:35:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.536833000056504,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 173.22934650019306,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:52:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:35:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.594711202005476,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.228384017145665,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:52:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:35:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.670738943688308,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.34638776367882,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-23 03:52:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:35:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -77176,668 +77176,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-05-23T22:20:55Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/a6b94433cc411da4e03724d54c5b158a24cfc6b3"
         },
-        "date": 1716536565926,
+        "date": 1716536817806,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 53340.341240000045,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6036.9134255000745,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:45:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:34:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 32647.24735656332,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 128.22891422666697,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:45:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:34:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 32484.890060499994,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.64084949991502,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:45:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:34:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 94.19022086013526,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.29985978654724,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:45:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:34:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 96.75279759147365,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.22647533997652,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:45:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:34:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56350.74879249987,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1922.9179699996166,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:45:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:08:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23603.44053233068,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 96.29596407001979,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:45:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:08:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 21319.333429500148,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60.72993099996893,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:45:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:08:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 180.05547488093498,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.254491473531806,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:45:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:08:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 191.6181682981477,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.581384785427327,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:45:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:08:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6328.47620550001,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5865.428363000319,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:18:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:07:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 107.63078116000695,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.17474247867843,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:18:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:07:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.90298399999983,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 87.99209249900741,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:18:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:07:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.021877670575236,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 53.13994388472499,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:18:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:07:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.7119793768181,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 46.25050560318028,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:18:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:07:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1927.5843179998446,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13788.609590000306,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:31:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:54:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.19803643329699,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 200.7562045140021,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:31:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:54:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.83859500001563,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.53075749989875,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:31:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:54:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 10.948486448532625,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 108.21229497936153,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:31:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:54:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.07130429856349,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 113.49323806909048,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:31:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:54:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1922.216698000284,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 257930.24714800002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:03:31 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:29:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 94.95481403333845,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 242762.75297554734,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:03:31 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:29:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60.12456300004487,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 244606.07823699957,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:03:31 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:29:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.021855861203537,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.98077336152181,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:03:31 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:29:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.31126985500981,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.57785580277736,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:03:31 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:29:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 10706.843501000094,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6121.938016500053,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:50:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:58:38 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 189.20717229067245,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 105.77741779333034,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:50:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:58:38 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 146.12494499988316,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.98791049982356,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:50:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:58:38 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.11347146428858,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.63266728413954,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:50:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:58:38 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.72714163173411,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.16002097251475,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:50:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:58:38 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6021.900845499886,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1841.6702359995725,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:01:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:42:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.95130278667511,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.74554867333487,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:01:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:42:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.55972399979328,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.50209150023511,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:01:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:42:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.442295126045764,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.661615063284032,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:01:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:42:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.16522131586314,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.941003594092496,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:01:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:42:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2501.4737395003976,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2419.35477349989,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:09:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:48:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.45848303999931,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 115.5059897760002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:09:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:48:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.39095849987643,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.34635549959057,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:09:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:48:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.586016224269795,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.00136931223196,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:09:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:48:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.826070068509377,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.246505667705023,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:09:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:48:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2018.6891855000795,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6034.086707999904,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:57:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:05:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.39448462671376,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.38761330666662,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:57:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:05:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.17358449987296,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.75800400024491,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:57:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:05:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.47898532661828,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.61820284429493,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:57:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:05:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.611666316849625,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.31828487961084,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:57:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:05:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1795.7626519996666,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5090.033450500414,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:37:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:59:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.60398087663452,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.7804306066076,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:37:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:59:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.510263999778545,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.67713300046307,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:37:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:59:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.359187912905757,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.23894181575603,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:37:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:59:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.727949840223024,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30.854331450733856,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:37:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:59:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5015.686206999817,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 76102.08756499992,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:54:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:14:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 88.56763556000563,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56401.26771139466,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:54:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:14:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 61.70900599954621,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69115.46357449993,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:54:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:14:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.956494874308596,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.94664253804291,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:54:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:14:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30.625132983095348,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.49061364211835,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:54:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:14:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5964.362075999929,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2039.5816205004849,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:30:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:01:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 124.80836461333107,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.73425066667066,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:30:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:01:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 87.86407200000212,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.44177899989154,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:30:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:01:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.03774788189492,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.61755039425222,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:30:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:01:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.01274685221,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.687218283229393,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:30:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:01:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 248859.66981499997,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60704.40484450001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:24:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:49:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 234459.43021540332,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38640.03059703801,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:24:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:49:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 235739.0611275,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38752.15475250002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:24:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:49:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.13166612947958,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 98.00013161536351,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:24:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:49:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.17691391666763,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 100.45309630879403,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:24:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:49:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11799.690713000473,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15843.160680000437,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:16:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:14:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 200.45587572466198,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 238.37949193667492,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:16:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:14:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 154.61897100021815,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 152.5966659992264,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:16:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:14:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.01456649864781,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 145.5245050964936,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:16:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:14:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.56831145706713,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 133.85664370317943,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:16:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:14:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2360.9765269998206,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3650.308398499874,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:43:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:30:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 114.06260132800647,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 142.73153383998155,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:43:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:30:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78.12535400034903,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.141590000414,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:43:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:30:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.382147388704105,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.28558014734185,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:43:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:30:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15.60249533650649,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.59825115081777,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:43:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:30:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3616.067349000332,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61973.708557499776,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:25:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:50:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 142.33096500331158,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 26749.070237649325,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:25:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:50:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 96.21500950015616,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 25255.849778500306,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:25:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:50:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.055657327705525,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 184.85499686074058,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:25:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:50:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.363944819167397,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 196.2129969969764,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:25:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:50:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5632.193468999503,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1943.4454174997882,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:02:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:36:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 115.86905475065336,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.29053529998782,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:02:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:36:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 86.12576200084732,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.866527499976655,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:02:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:36:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 51.53382027128091,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.08816577639735,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:02:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:36:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 44.77461915881346,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.178469397792009,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:02:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:36:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6073.618323000119,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6370.195135500012,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:54:35 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:28:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 104.26460736663405,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 108.33617884666637,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:54:35 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:28:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.37401249984032,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.33415199995352,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:54:35 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:28:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.37037158497149,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.22914209162545,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:54:35 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:28:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.94458470579859,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.92209173912628,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:54:35 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:28:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11876.43002400091,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5320.986331498716,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:09:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:40:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 160.14412551399556,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 181.93842036265414,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:09:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:40:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 128.1723874999443,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 173.00369149961625,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:09:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:40:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 105.47279808382501,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.66642364158503,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:09:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:40:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 94.25909209936462,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.79658968764954,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:09:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:40:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 73406.11455199996,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7782.100346499988,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:10:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:41:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54311.549439408,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 156.5081379680014,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:10:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:41:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66464.88848750005,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 130.07041599996683,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:10:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:41:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.72266576174421,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.203279359686796,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:10:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:41:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 63.966167132927936,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56.921139528492176,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:10:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:41:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7404.249196000023,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2569.8311404999004,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:37:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:14:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 154.10822315466854,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.11115059466101,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:37:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:14:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 127.76332000009916,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.52654550001171,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:37:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:14:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55.44544366717936,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.055629099922392,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:37:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:14:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54.98981760273222,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.188879242395696,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:37:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:14:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5116.336590000174,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 14923.420833999444,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:35:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:21:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 180.1112781680058,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 220.1605600566727,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:35:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:21:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 173.22934650019306,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 175.95376700000998,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:35:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:21:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.228384017145665,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.74639971695998,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:35:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:21:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.34638776367882,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 126.56766392840541,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:35:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:21:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -77858,668 +77858,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-05-23T22:20:55Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/a6b94433cc411da4e03724d54c5b158a24cfc6b3"
         },
-        "date": 1716536817806,
+        "date": 1716536967175,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6036.9134255000745,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3674.2376610000065,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:34:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:33:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 128.22891422666697,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 144.098352476661,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:34:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:33:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.64084949991502,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.25462249994598,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:34:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:33:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.29985978654724,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.580405419507663,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:34:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:33:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.22647533997652,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.82968114245691,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:34:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:33:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1922.9179699996166,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6158.242706500005,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:08:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:00:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 96.29596407001979,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 103.9525071600201,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:08:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:00:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60.72993099996893,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.32726299994829,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:08:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:00:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.254491473531806,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.84905220986917,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:08:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:00:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.581384785427327,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.341917398678106,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:08:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:00:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5865.428363000319,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66139.41254149994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:07:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:51:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.17474247867843,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 43122.886156283326,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:07:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:51:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 87.99209249900741,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 43070.87479899997,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:07:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:51:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 53.13994388472499,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 100.30706422659057,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:07:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:51:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 46.25050560318028,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 103.03386879880344,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:07:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:51:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13788.609590000306,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18679.325246499957,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:54:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:24:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 200.7562045140021,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 231.30088457267934,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:54:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:24:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.53075749989875,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 185.6688005000251,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:54:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:24:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 108.21229497936153,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 153.79304180612957,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:54:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:24:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 113.49323806909048,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.74625207767156,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:54:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:24:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 257930.24714800002,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1851.8035160000181,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:29:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:45:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 242762.75297554734,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.9118324999951,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:29:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:45:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 244606.07823699957,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61.61253549998946,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:29:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:45:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.98077336152181,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.751144219890206,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:29:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:45:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.57785580277736,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.088412348256844,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:29:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:45:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6121.938016500053,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2647.3707849995662,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:58:38 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:17:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 105.77741779333034,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.92175404267255,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:58:38 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:17:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.98791049982356,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.58569550020911,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:58:38 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:17:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.63266728413954,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.90610722549118,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:58:38 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:17:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.16002097251475,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.8848995473422,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:58:38 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:17:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1841.6702359995725,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1952.816832500048,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:42:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:11:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.74554867333487,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 97.18456532334434,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:42:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:11:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.50209150023511,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 62.47611900016636,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:42:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:11:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.661615063284032,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.603912201429557,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:42:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:11:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.941003594092496,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.803728676648895,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:42:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:11:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2419.35477349989,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19390.905797000414,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:48:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:18:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 115.5059897760002,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1893.6356199340166,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:48:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:18:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.34635549959057,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 314.27889649967256,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:48:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:18:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.00136931223196,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 159.7659810246803,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:48:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:18:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.246505667705023,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 147.45676941805075,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:48:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:18:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6034.086707999904,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5496.9637855001565,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:05:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:43:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.38761330666662,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 183.92184810803397,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:05:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:43:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.75800400024491,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 173.60201850078738,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:05:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:43:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.61820284429493,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.53539019691746,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:05:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:43:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.31828487961084,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.7112078901483,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:05:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:43:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5090.033450500414,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5146.833499498825,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:59:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:02:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.7804306066076,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.67963537338558,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:59:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:02:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.67713300046307,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.43261250085197,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:59:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:02:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.23894181575603,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.51844826831605,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:59:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:02:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30.854331450733856,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.104311450581175,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:59:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:02:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 76102.08756499992,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6061.453177999965,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:14:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:08:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56401.26771139466,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 122.60437605000484,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:14:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:08:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69115.46357449993,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.86141350008802,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:14:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:08:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.94664253804291,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.935846385705155,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:14:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:08:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.49061364211835,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.556099753483785,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:14:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:08:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2039.5816205004849,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1959.2217435001658,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:01:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:38:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.73425066667066,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.03675333999126,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:01:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:38:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.44177899989154,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.935721999881935,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:01:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:38:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.61755039425222,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.231196407969382,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:01:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:38:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.687218283229393,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.280820497209362,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:01:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:38:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60704.40484450001,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 258816.12126400022,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:49:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:31:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38640.03059703801,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 244034.62704731268,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:49:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:31:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38752.15475250002,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 245447.883156,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:49:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:31:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 98.00013161536351,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.6744499885317,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:49:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:31:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 100.45309630879403,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.75007709108667,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:49:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:31:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15843.160680000437,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6111.26218849995,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:14:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:36:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 238.37949193667492,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 127.27864344000031,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:14:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:36:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 152.5966659992264,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 87.03411599992705,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:14:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:36:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 145.5245050964936,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.89042627643755,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:14:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:36:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 133.85664370317943,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.66110326928299,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:14:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:36:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3650.308398499874,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70477.60344899961,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:30:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:54:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 142.73153383998155,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31452.343251931336,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:30:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:54:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.141590000414,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30347.126394500265,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:30:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:54:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.28558014734185,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 192.51186562524066,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:30:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:54:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.59825115081777,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 204.3971678225958,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:30:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:54:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 61973.708557499776,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16623.214362499766,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:50:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:58:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 26749.070237649325,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 219.05289470665957,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:50:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:58:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 25255.849778500306,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 171.0892044998218,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:50:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:58:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 184.85499686074058,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 136.97556747857035,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:50:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:58:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 196.2129969969764,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143.10401456193338,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:50:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:58:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1943.4454174997882,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2505.5321414997707,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:36:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:51:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.29053529998782,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 116.93201387466252,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:36:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:51:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.866527499976655,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.59506499987401,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:36:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:51:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.08816577639735,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.742624648202092,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:36:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:51:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.178469397792009,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.925428114448234,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:36:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:51:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6370.195135500012,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6096.751922499607,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:28:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:10:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 108.33617884666637,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 122.48616470934212,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:28:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:10:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.33415199995352,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.64066650030145,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:28:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:10:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.22914209162545,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55.061820024085044,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:28:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:10:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.92209173912628,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 48.02706380161126,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:28:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:10:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5320.986331498716,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2048.1824245002827,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:40:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:04:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 181.93842036265414,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.24031411338729,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:40:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:04:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 173.00369149961625,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42.830491999666265,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:40:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:04:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.66642364158503,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.696145043637065,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:40:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:04:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.79658968764954,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.771937668840932,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:40:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:04:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7782.100346499988,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79057.19572249995,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:41:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:16:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 156.5081379680014,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59007.21182767066,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:41:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:16:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 130.07041599996683,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 72297.3000075001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:41:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:16:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.203279359686796,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.75301193806251,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:41:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:16:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56.921139528492176,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.9750591477635,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:41:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:16:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2569.8311404999004,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7990.959115999999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:14:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:43:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.11115059466101,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.34963135199723,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:14:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:43:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.52654550001171,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 131.77123749994735,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:14:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:43:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.055629099922392,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.10565390062231,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:14:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:43:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.188879242395696,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.683953754812755,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:14:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:43:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 14923.420833999444,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6405.790125000067,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:21:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:29:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 220.1605600566727,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 111.3461874133312,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:21:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:29:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 175.95376700000998,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 72.50817299996015,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:21:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:29:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.74639971695998,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.402849202360066,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:21:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:29:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 126.56766392840541,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.10504541279752,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:21:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:29:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -78540,668 +78540,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-05-23T22:20:55Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/a6b94433cc411da4e03724d54c5b158a24cfc6b3"
         },
-        "date": 1716536967175,
+        "date": 1716622601693,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3674.2376610000065,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6031.264369499922,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:33:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:27:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 144.098352476661,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 127.44546363666662,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:33:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:27:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.25462249994598,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.30550150001181,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:33:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:27:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.580405419507663,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.3387351725222,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:33:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:27:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.82968114245691,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.18327821422785,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:33:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:27:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6158.242706500005,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6109.72768150009,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:00:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:51:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 103.9525071600201,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 104.34567401334182,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:00:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:51:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.32726299994829,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.67937049996908,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:00:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:51:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.84905220986917,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.591272418168025,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:00:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:51:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.341917398678106,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.09092174712987,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:00:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:51:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66139.41254149994,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 256462.49853600012,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:51:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:22:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 43122.886156283326,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 242283.18882904603,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:51:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:22:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 43070.87479899997,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 243359.83851049969,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:51:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:22:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 100.30706422659057,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.40998149569158,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:51:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:22:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 103.03386879880344,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.80840559486172,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:51:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:22:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18679.325246499957,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6056.822688500006,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:24:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:59:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 231.30088457267934,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.36565666999104,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:24:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:59:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 185.6688005000251,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.84811750011067,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:24:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:59:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 153.79304180612957,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.67301465337826,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:24:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:59:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.74625207767156,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.34406697679638,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:24:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:59:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1851.8035160000181,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 77362.13200599991,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:45:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:07:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.9118324999951,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57560.854989348,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:45:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:07:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 61.61253549998946,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70575.79673300005,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:45:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:07:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.751144219890206,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.98880239881521,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:45:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:07:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.088412348256844,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.72136673644505,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:45:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:07:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2647.3707849995662,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1829.048375500406,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:17:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:35:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.92175404267255,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.66710255002788,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:17:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:35:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.58569550020911,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.33029700013503,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:17:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:35:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.90610722549118,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.626857479513577,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:17:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:35:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.8848995473422,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.911935682773704,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:17:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:35:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1952.816832500048,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 62127.03208150015,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:11:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:43:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 97.18456532334434,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 26797.982637383324,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:11:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:43:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 62.47611900016636,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 25023.304212999392,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:11:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:43:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.603912201429557,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 185.5416417471531,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:11:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:43:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.803728676648895,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 197.664749041314,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:11:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:43:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19390.905797000414,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5090.012251000189,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:18:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:52:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1893.6356199340166,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.4689235599435,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:18:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:52:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 314.27889649967256,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 62.12264149871771,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:18:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:52:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 159.7659810246803,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.19461875864989,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:18:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:52:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 147.45676941805075,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30.825566204415544,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:18:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:52:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5496.9637855001565,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59761.77965350007,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:43:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:42:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 183.92184810803397,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37601.94518529133,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:43:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:42:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 173.60201850078738,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37383.080577499866,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:43:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:42:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.53539019691746,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 97.36235305567693,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:43:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:42:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.7112078901483,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 99.9264288851969,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:43:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:42:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5146.833499498825,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1940.5441085004895,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:02:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:29:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.67963537338558,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.5169629466521,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:02:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:29:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.43261250085197,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.2674049995112,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:02:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:29:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.51844826831605,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.064534162064737,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:02:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:29:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.104311450581175,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.140311892694836,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:02:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:29:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6061.453177999965,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2411.4901895000003,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:08:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:41:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 122.60437605000484,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 115.89777418667168,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:08:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:41:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.86141350008802,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.16683850034678,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:08:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:41:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.935846385705155,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.954706164294176,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:08:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:41:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.556099753483785,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.164560357492107,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:08:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:41:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1959.2217435001658,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13703.912494499946,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:38:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:47:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.03675333999126,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 200.798709573323,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:38:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:47:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.935721999881935,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 159.21959099978267,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:38:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:47:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.231196407969382,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 107.51241611053686,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:38:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:47:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.280820497209362,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 112.25538014624625,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:38:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:47:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 258816.12126400022,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2582.2408904996337,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:31:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:07:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 244034.62704731268,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.13682931333702,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:31:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:07:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 245447.883156,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.3190564996803,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:31:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:07:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.6744499885317,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.15072832813987,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:31:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:07:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.75007709108667,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.324471008133802,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:31:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:07:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6111.26218849995,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7693.3109779999995,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:36:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:34:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 127.27864344000031,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 154.85766106533146,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:36:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:34:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 87.03411599992705,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 125.95628700000816,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:36:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:34:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.89042627643755,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.20704889317462,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:36:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:34:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.66110326928299,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56.742528299871175,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:36:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:34:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70477.60344899961,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 14864.066802000252,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:54:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31452.343251931336,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 213.90278705802183,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:54:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30347.126394500265,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 171.76611750028314,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:54:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 192.51186562524066,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.36697423094968,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:54:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 204.3971678225958,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 126.41396716324671,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 04:54:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16623.214362499766,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1920.3525215002628,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:58:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:01:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 219.05289470665957,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.54855116998927,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:58:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:01:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 171.0892044998218,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61.15634800016778,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:58:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:01:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 136.97556747857035,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.170856558273053,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:58:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:01:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.10401456193338,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.425277037628705,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:58:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:01:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2505.5321414997707,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6348.104188499974,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:51:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:16:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 116.93201387466252,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 108.28916381333177,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:51:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:16:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.59506499987401,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.46765200007121,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:51:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:16:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.742624648202092,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.17308675822627,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:51:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:16:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.925428114448234,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.84379338958343,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:51:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:16:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6096.751922499607,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2030.2066539998123,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:10:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:54:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 122.48616470934212,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.28930491332237,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:10:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:54:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.64066650030145,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.891775000091,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:10:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:54:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55.061820024085044,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.581854473220956,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:10:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:54:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 48.02706380161126,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.685599667260387,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-24 05:10:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:54:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2048.1824245002827,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3643.7305149993335,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:04:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:23:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.24031411338729,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 142.4523402499896,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:04:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:23:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42.830491999666265,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 96.58675099854008,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:04:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:23:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.696145043637065,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.319572147250838,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:04:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:23:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.771937668840932,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.63598660201902,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 04:04:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:23:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79057.19572249995,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5920.897044999947,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:16:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:00:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59007.21182767066,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.08681493996846,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:16:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:00:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 72297.3000075001,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 90.7725160004702,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:16:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:00:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.75301193806251,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 53.34940247664822,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:16:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:00:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.9750591477635,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 46.33074625403821,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 03:16:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:00:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7990.959115999999,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5312.3135955002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:43:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:33:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.34963135199723,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 183.42202690800573,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:43:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:33:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 131.77123749994735,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 175.19969800014223,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:43:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:33:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.10565390062231,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.6223924948184,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:43:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:33:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.683953754812755,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.70505213202464,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:43:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:33:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6405.790125000067,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15973.229238000386,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:29:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:07:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 111.3461874133312,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 248.27122630997718,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:29:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:07:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 72.50817299996015,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 158.4686200003489,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:29:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:07:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.402849202360066,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 146.39758932653328,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:29:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:07:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.10504541279752,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 133.34830056155036,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-24 02:29:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:07:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -79222,668 +79222,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-05-23T22:20:55Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/a6b94433cc411da4e03724d54c5b158a24cfc6b3"
         },
-        "date": 1716622601693,
+        "date": 1716622960721,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6031.264369499922,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2504.7344915001304,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:27:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:10:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 127.44546363666662,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.13965160133496,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:27:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:10:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.30550150001181,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.43372849983643,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:27:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:10:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.3387351725222,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.53969207923632,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:27:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:10:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.18327821422785,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.685054020185998,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:27:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:10:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6109.72768150009,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2361.4371645003303,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:51:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:44:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 104.34567401334182,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 114.3273589413499,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:51:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:44:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.67937049996908,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78.56191150040104,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:51:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:44:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.591272418168025,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.418019590081464,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:51:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:44:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.09092174712987,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15.635949923512728,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:51:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:44:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 256462.49853600012,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3611.443037999379,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:22:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:26:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 242283.18882904603,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143.57639907332668,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:22:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:26:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 243359.83851049969,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.61489900109154,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:22:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:26:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.40998149569158,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.954262386158387,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:22:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:26:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.80840559486172,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.42831613275324,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:22:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:26:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6056.822688500006,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7404.824202499981,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:59:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:38:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.36565666999104,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 155.15427302533803,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:59:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:38:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.84811750011067,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 128.7996835000058,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:59:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:38:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.67301465337826,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55.452128171401796,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:59:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:38:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.34406697679638,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54.96808436744122,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:59:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:38:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 77362.13200599991,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5968.643334000035,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:07:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:32:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57560.854989348,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 127.07519155334617,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:07:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:32:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70575.79673300005,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.55286099993009,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:07:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:32:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.98880239881521,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.01037937361815,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:07:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:32:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.72136673644505,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.04350030887357,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:07:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:32:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1829.048375500406,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5147.891583999808,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:35:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:36:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.66710255002788,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 180.405902318671,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:35:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:36:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.33029700013503,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 171.71799599964288,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:35:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:36:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.626857479513577,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.26570201250069,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:35:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:36:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.911935682773704,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.395516495092856,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:35:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:36:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 62127.03208150015,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12136.588689000746,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:43:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:09:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 26797.982637383324,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 161.92133887665963,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:43:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:09:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 25023.304212999392,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 129.32198549970053,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:43:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:09:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 185.5416417471531,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 107.62503236256481,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:43:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:09:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 197.664749041314,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 96.61493116307709,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:43:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:09:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5090.012251000189,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 247683.95640800032,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:52:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:26:08 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.4689235599435,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 234411.032387272,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:52:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:26:08 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 62.12264149871771,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 234740.4976380003,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:52:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:26:08 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.19461875864989,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.25225341825055,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:52:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:26:08 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30.825566204415544,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.13249446891975,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:52:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:26:08 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59761.77965350007,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6076.790150499846,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:42:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:55:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37601.94518529133,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 104.47128707999885,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:42:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:55:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37383.080577499866,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.7422000000879,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:42:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:55:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 97.36235305567693,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.4251162420238,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:42:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:55:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 99.9264288851969,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.91772018699045,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:42:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:55:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1940.5441085004895,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 10745.297296500212,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:29:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:51:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.5169629466521,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 188.61554564599464,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:29:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:51:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.2674049995112,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 140.71318149990475,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:29:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:51:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.064534162064737,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.44202160884846,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:29:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:51:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.140311892694836,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.05510422120436,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:29:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:51:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2411.4901895000003,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 73398.27332400001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:41:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:11:43 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 115.89777418667168,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54209.12408943067,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:41:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:11:43 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.16683850034678,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66537.2725405,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:41:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:11:43 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.954706164294176,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.09792893281684,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:41:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:11:43 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.164560357492107,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.21518476361895,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:41:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:11:43 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13703.912494499946,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5761.7960370007495,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:47:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:03:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 200.798709573323,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.40179928800596,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:47:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:03:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 159.21959099978267,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.17287200044666,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:47:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:03:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 107.51241611053686,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 52.716252278091964,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:47:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:03:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 112.25538014624625,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 46.087029270857535,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:47:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:03:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2582.2408904996337,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5019.673364499795,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:07:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:54:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.13682931333702,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.60357411326791,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:07:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:54:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.3190564996803,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61.26066899923899,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:07:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:54:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.15072832813987,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.93568086410549,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:07:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:54:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.324471008133802,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30.609382320239355,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:07:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:54:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7693.3109779999995,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2012.1260879996044,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:34:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:58:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 154.85766106533146,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.32166984004411,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:34:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:58:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 125.95628700000816,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.357617999939976,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:34:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:58:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.20704889317462,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.458834435932188,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:34:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:58:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56.742528299871175,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.56015452206276,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:34:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:58:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 14864.066802000252,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56270.08645049955,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:46:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 213.90278705802183,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23728.259643247988,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:46:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 171.76611750028314,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 21910.15640349997,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:46:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.36697423094968,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 180.0254877996741,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:46:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 126.41396716324671,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 191.79463254781044,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:46:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1920.3525215002628,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 53237.17141300017,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:01:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:46:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.54855116998927,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 32490.671532157336,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:01:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:46:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 61.15634800016778,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31882.60320900008,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:01:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:46:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.170856558273053,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 94.03625464512164,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:01:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:46:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.425277037628705,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 96.38718661263374,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:01:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:46:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6348.104188499974,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6329.448286499996,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:16:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:25:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 108.28916381333177,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 107.97678893333416,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:16:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:25:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.46765200007121,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.86343199997691,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:16:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:25:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.17308675822627,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.05337502494129,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:16:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:25:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.84379338958343,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.76410726584316,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:16:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:25:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2030.2066539998123,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6007.674074999841,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:54:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:03:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.28930491332237,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 118.78175235666428,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:54:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:03:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.891775000091,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78.45285550024528,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:54:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:03:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.581854473220956,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.34144362994333,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:54:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:03:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.685599667260387,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.9417495712645,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:54:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:03:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3643.7305149993335,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1925.279082999623,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:23:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:32:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 142.4523402499896,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78.25064233332039,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:23:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:32:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 96.58675099854008,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.08967000031771,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:23:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:32:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.319572147250838,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 10.965364766620553,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:23:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:32:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.63598660201902,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.035743212651537,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:23:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:32:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5920.897044999947,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1800.6607509996684,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:00:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:38:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.08681493996846,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.66373576337112,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:00:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:38:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 90.7725160004702,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55.42086900004506,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:00:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:38:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 53.34940247664822,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.420129707509071,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:00:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:38:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 46.33074625403821,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.642768863311378,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:00:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:38:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5312.3135955002,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11904.731997000454,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:33:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:16:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 183.42202690800573,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 201.0525258406712,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:33:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:16:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 175.19969800014223,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 151.09472750009445,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:33:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:16:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.6223924948184,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.91106637531124,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:33:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:16:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.70505213202464,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 94.19579719517606,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:33:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:16:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15973.229238000386,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1896.5291289996458,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:07:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:04:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 248.27122630997718,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 94.49556736662696,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:07:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:04:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 158.4686200003489,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.09356399979515,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:07:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:04:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 146.39758932653328,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.103081050226754,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:07:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:04:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 133.34830056155036,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.383000956841924,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:07:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:04:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -79904,668 +79904,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-05-23T22:20:55Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/a6b94433cc411da4e03724d54c5b158a24cfc6b3"
         },
-        "date": 1716622960721,
+        "date": 1716622967578,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2504.7344915001304,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6087.053470500905,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:10:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:04:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.13965160133496,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.27033984530863,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:10:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:04:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.43372849983643,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 90.28590799971425,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:10:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:04:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.53969207923632,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55.27311377605129,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:10:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:04:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.685054020185998,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 48.228761984107344,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:10:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:04:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2361.4371645003303,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19589.319870999134,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:44:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:12:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 114.3273589413499,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1984.5874082339953,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:44:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:12:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78.56191150040104,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 309.7395859995231,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:44:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:12:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.418019590081464,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 159.5124573048829,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:44:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:12:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15.635949923512728,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 147.74307465226795,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:44:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:12:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3611.443037999379,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1867.0875650004746,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:26:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:38:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.57639907332668,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.74675671998375,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:26:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:38:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.61489900109154,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.528012999886414,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:26:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:38:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.954262386158387,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.87366424046731,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:26:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:38:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.42831613275324,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.188751277435541,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:26:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:38:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7404.824202499981,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66141.291937,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:38:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:45:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 155.15427302533803,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 43037.73437554266,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:38:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:45:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 128.7996835000058,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42584.142079000005,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:38:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:45:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55.452128171401796,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 101.05397899932792,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:38:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:45:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54.96808436744122,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 103.00951704392612,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:38:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:45:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5968.643334000035,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18228.306296000028,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:32:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:17:30 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 127.07519155334617,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 236.82018156332683,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:32:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:17:30 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.55286099993009,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 194.85675999976593,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:32:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:17:30 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.01037937361815,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 150.58851241833838,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:32:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:17:30 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.04350030887357,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 154.70345973423565,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:32:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:17:30 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5147.891583999808,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5145.437896000658,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:36:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:56:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 180.405902318671,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.9104367600424,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:36:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:56:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 171.71799599964288,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.93177600077615,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:36:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:56:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.26570201250069,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.52623326125513,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:36:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:56:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.395516495092856,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.122674256257213,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:36:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:56:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12136.588689000746,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70695.30652600042,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:09:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:48:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 161.92133887665963,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31733.29802215001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:09:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:48:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 129.32198549970053,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30824.094797000726,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:09:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:48:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 107.62503236256481,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 192.99178777409415,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:09:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:48:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 96.61493116307709,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 203.85377518748848,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:09:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:48:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 247683.95640800032,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5441.562446500029,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:26:08 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:37:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 234411.032387272,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 183.71218506135966,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:26:08 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:37:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 234740.4976380003,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 173.88336850035557,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:26:08 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:37:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.25225341825055,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.32396189113019,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:26:08 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:37:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.13249446891975,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.381626289477126,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:26:08 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:37:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6076.790150499846,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3683.198848999382,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:55:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:27:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 104.47128707999885,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 142.74309152326168,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:55:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:27:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.7422000000879,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 94.91837800123903,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:55:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:27:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.4251162420238,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.700858438495132,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:55:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:27:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.91772018699045,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.94484287721668,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:55:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:27:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 10745.297296500212,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6106.959968999945,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:51:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:02:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 188.61554564599464,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 122.64483668669224,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:51:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:02:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 140.71318149990475,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 88.6377919998722,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:51:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:02:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.44202160884846,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.181763937938484,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:51:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:02:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.05510422120436,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.75833014081126,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:51:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:02:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 73398.27332400001,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16295.719529999587,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:11:43 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:51:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54209.12408943067,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 215.7550698966558,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:11:43 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:51:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66537.2725405,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 175.40716299981796,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:11:43 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:51:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.09792893281684,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 132.71233496425143,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:11:43 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:51:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.21518476361895,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 137.6976882858219,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:11:43 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:51:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5761.7960370007495,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2501.266753499749,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:03:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:45:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.40179928800596,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 117.88623574000788,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:03:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:45:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.17287200044666,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.03689400054282,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:03:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:45:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 52.716252278091964,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.54806090135829,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:03:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:45:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 46.087029270857535,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.701945583532023,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:03:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:45:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5019.673364499795,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1954.6669180003846,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:54:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:04:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.60357411326791,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 96.90659437666909,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:54:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:04:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 61.26066899923899,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 62.649019000673434,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:54:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:04:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.93568086410549,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.496277096143636,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:54:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:04:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30.609382320239355,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.771188410709488,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:54:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:04:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2012.1260879996044,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2053.8726159998077,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:58:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:58:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.32166984004411,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.95088673997209,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:58:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:58:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.357617999939976,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.46892750011466,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:58:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:58:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.458834435932188,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.762995127248177,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:58:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:58:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.56015452206276,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.824517697848322,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:58:00 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:58:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56270.08645049955,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2641.5741414998593,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:46:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:10:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23728.259643247988,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 122.56773874400096,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:46:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:10:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 21910.15640349997,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 87.30907450035374,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:46:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:10:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 180.0254877996741,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.862708211385034,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:46:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:10:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 191.79463254781044,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.8965723674272,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:46:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:10:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 53237.17141300017,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7975.0274250000075,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:46:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:37:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 32490.671532157336,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.20777703200505,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:46:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:37:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31882.60320900008,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 131.1412024999754,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:46:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:37:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 94.03625464512164,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.22152148283052,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:46:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:37:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 96.38718661263374,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.85854985409576,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:46:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:37:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6329.448286499996,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6414.791728000011,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:25:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:23:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 107.97678893333416,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 111.42439213332712,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:25:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:23:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.86343199997691,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 71.28911649994052,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:25:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:23:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.05337502494129,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.492517040696626,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:25:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:23:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.76410726584316,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.22096165468236,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:25:52 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:23:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6007.674074999841,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6151.79374000013,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:03:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:54:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 118.78175235666428,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 105.00769699332523,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:03:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:54:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78.45285550024528,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.24624499997844,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:03:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:54:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.34144362994333,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.81058708757571,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:03:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:54:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.9417495712645,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.314714704806526,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:03:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:54:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1925.279082999623,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1960.4870684997877,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:32:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:32:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78.25064233332039,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.26428046660537,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:32:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:32:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.08967000031771,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.91496050002752,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:32:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:32:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 10.965364766620553,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.222392342401225,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:32:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:32:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.035743212651537,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.286118063279309,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:32:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:32:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1800.6607509996684,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 259450.7627295002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:38:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:25:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.66373576337112,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 245340.87704937998,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:38:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:25:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55.42086900004506,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 246088.97628649994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:38:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:25:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.420129707509071,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.7196112040547,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:38:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:25:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.642768863311378,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.9213712702126,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:38:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:25:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11904.731997000454,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79124.52998549998,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:16:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:10:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 201.0525258406712,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59467.74121747067,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:16:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:10:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 151.09472750009445,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 72249.8947534998,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:16:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:10:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.91106637531124,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.52950054240793,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:16:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:10:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 94.19579719517606,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.22939064299659,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:16:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:10:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1896.5291289996458,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6101.417414000025,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:04:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:30:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 94.49556736662696,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 127.19921564999822,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:04:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:30:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.09356399979515,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.65352849998453,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:04:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:30:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.103081050226754,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.80125146138109,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:04:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:30:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.383000956841924,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:04:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.610102991416355,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:30:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -80586,668 +80586,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-05-23T22:20:55Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/a6b94433cc411da4e03724d54c5b158a24cfc6b3"
         },
-        "date": 1716622967578,
+        "date": 1716708770539,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6087.053470500905,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13719.834091999473,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:04:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:45:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.27033984530863,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 199.3323442593355,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:04:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:45:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 90.28590799971425,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 161.42437600001358,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:04:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:45:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55.27311377605129,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 107.49689771698549,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:04:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:45:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 48.228761984107344,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 112.18992244145717,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:04:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:45:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19589.319870999134,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5296.767851999903,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:12:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:31:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1984.5874082339953,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 180.53457311204207,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:12:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:31:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 309.7395859995231,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 171.78857699946093,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:12:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:31:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 159.5124573048829,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.458506446936795,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:12:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:31:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 147.74307465226795,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.67174997975881,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 05:12:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:31:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1867.0875650004746,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2032.2572195000248,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:38:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:52:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.74675671998375,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.81863097331734,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:38:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:52:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.528012999886414,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.790129000313755,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:38:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:52:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.87366424046731,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.582463725070378,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:38:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:52:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.188751277435541,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.698387482363728,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:38:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:52:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66141.291937,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2575.869489500292,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:45:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 04:05:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 43037.73437554266,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.25202191999172,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:45:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 04:05:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42584.142079000005,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 85.06724200015015,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:45:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 04:05:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 101.05397899932792,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.216379730991278,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:45:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 04:05:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 103.00951704392612,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.215824231136967,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:45:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 04:05:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18228.306296000028,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15963.914889499392,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:17:30 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 05:05:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 236.82018156332683,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 252.89774244331053,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:17:30 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 05:05:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 194.85675999976593,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 158.27316349987086,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:17:30 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 05:05:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 150.58851241833838,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 146.5767083485674,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:17:30 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 05:05:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 154.70345973423565,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 134.35014069593458,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:17:30 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 05:05:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5145.437896000658,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7744.966965999993,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:56:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:32:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.9104367600424,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 156.67808693733514,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:56:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:32:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.93177600077615,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 130.4804339998782,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:56:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:32:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.52623326125513,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.31995415834249,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:56:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:32:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.122674256257213,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56.84771027160454,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:56:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:32:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70695.30652600042,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5091.644082499442,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:48:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:49:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31733.29802215001,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.94019860650344,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:48:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:49:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30824.094797000726,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61.469466500057024,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:48:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:49:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 192.99178777409415,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.180142753186054,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:48:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:49:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 203.85377518748848,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30.832656555818044,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:48:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:49:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5441.562446500029,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 14898.216261499783,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:37:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 04:11:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 183.71218506135966,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 216.7655932633364,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:37:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 04:11:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 173.88336850035557,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 168.2440224999482,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:37:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 04:11:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.32396189113019,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.29937097038504,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:37:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 04:11:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.381626289477126,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 126.60961795147601,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:37:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 04:11:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3683.198848999382,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59762.60505950006,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:27:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:40:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 142.74309152326168,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37824.285817384,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:27:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:40:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 94.91837800123903,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37138.776646,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:27:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:40:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.700858438495132,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 97.65028427105449,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:27:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:40:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.94484287721668,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 100.05101871480682,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-25 04:27:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:40:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6106.959968999945,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6352.270934000046,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:02:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:19:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 122.64483668669224,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 110.53123650666824,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:02:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:19:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 88.6377919998722,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.64712799999961,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:02:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:19:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.181763937938484,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.163775880896324,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:02:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:19:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.75833014081126,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.864718307407074,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:02:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:19:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16295.719529999587,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 63241.37254749894,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:51:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:41:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 215.7550698966558,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 27463.514011721312,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:51:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:41:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 175.40716299981796,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 26067.371138500675,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:51:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:41:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 132.71233496425143,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 185.51776810311102,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:51:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:41:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 137.6976882858219,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 197.21703349379666,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:51:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:41:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2501.266753499749,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1919.3635840001662,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:45:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:58:59 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 117.88623574000788,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.73241906336382,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:45:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:58:59 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.03689400054282,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.73461299961491,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:45:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:58:59 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.54806090135829,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.227132877671762,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:45:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:58:59 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.701945583532023,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.546944922548871,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:45:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:58:59 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1954.6669180003846,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 77040.78036249985,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:04:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:05:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 96.90659437666909,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57015.35536195602,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:04:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:05:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 62.649019000673434,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70062.93960500011,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:04:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:05:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.496277096143636,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.87456157393004,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:04:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:05:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.771188410709488,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.98551638928194,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:04:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:05:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2053.8726159998077,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3644.370075500319,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:58:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:21:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.95088673997209,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143.1177609900169,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:58:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:21:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.46892750011466,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.1996570000374,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:58:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:21:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.762995127248177,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.16515333873722,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:58:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:21:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.824517697848322,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.60205459914785,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:58:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:21:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2641.5741414998593,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5880.626533000395,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:10:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:57:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 122.56773874400096,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.37698160937725,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:10:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:57:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 87.30907450035374,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.13219200116873,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:10:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:57:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.862708211385034,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 53.17952147752356,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:10:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:57:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.8965723674272,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 46.375887840359105,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 04:10:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:57:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7975.0274250000075,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 256106.76833649995,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:37:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:19:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.20777703200505,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 241175.34822180797,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:37:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:19:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 131.1412024999754,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 242843.23037950025,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:37:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:19:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.22152148283052,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.7800826563357,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:37:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:19:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.85854985409576,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.41403563661216,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:37:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:19:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6414.791728000011,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6005.199937999919,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:23:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:25:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 111.42439213332712,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 127.69117487666374,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:23:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:25:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 71.28911649994052,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 85.98021049999716,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:23:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:25:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.492517040696626,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.394355428637475,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:23:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:25:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.22096165468236,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.33758301703981,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:23:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:25:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6151.79374000013,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6105.812039000057,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:54:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:49:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 105.00769699332523,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 104.84896812001048,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:54:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:49:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.24624499997844,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.20544299994435,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:54:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:49:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.81058708757571,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.54418925176356,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:54:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:49:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.314714704806526,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.03690141021941,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:54:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:49:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1960.4870684997877,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6035.864570500053,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:32:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:56:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.26428046660537,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 123.6156219099712,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:32:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:56:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.91496050002752,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.08461649996934,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:32:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:56:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.222392342401225,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.57377718535252,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:32:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:56:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.286118063279309,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.19125505428643,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:32:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:56:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 259450.7627295002,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2404.0219759999673,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:25:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:38:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 245340.87704937998,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 115.56893722801397,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:25:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:38:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 246088.97628649994,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.24620750004397,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:25:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:38:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.7196112040547,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.934681919266144,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:25:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:38:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.9213712702126,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.157597388916138,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:25:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:38:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79124.52998549998,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1937.9353819999778,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:10:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:26:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59467.74121747067,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.05925926001146,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:10:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:26:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 72249.8947534998,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.29892349990405,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:10:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:26:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.52950054240793,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.060224252329409,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:10:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:26:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.22939064299659,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.175544986865699,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 03:10:40 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:26:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6101.417414000025,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1853.6365575000673,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:30:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:32:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 127.19921564999822,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.67919123664494,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:30:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:32:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.65352849998453,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.12286549983037,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:30:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:32:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.80125146138109,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.59777746379352,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:30:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:32:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.610102991416355,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.84972288918397,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-25 02:30:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:32:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -81268,668 +81268,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-05-23T22:20:55Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/a6b94433cc411da4e03724d54c5b158a24cfc6b3"
         },
-        "date": 1716708770539,
+        "date": 1716796233577,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13719.834091999473,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6364.633738500004,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:45:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:13:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 199.3323442593355,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 110.9386039533274,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:45:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:13:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 161.42437600001358,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.67654700006187,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:45:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:13:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 107.49689771698549,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.20073086192139,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:45:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:13:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 112.18992244145717,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.898053901060116,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:45:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:13:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5296.767851999903,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6081.8272660001185,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:31:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:57:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 180.53457311204207,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.14610562666105,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:31:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:57:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 171.78857699946093,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.3581659999727,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:31:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:57:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.458506446936795,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.76893995195344,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:31:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:57:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.67174997975881,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.399661849383165,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:31:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:57:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2032.2572195000248,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1917.0592025002406,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:52:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:00:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.81863097331734,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.56634040335362,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:52:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:00:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.790129000313755,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.77325799974642,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:52:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:00:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.582463725070378,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.22545734704069,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:52:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:00:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.698387482363728,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.533952667372015,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:52:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:00:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2575.869489500292,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7684.304952500042,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 04:05:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:33:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.25202191999172,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 155.3382852573356,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 04:05:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:33:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 85.06724200015015,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 128.05097249997743,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 04:05:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:33:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.216379730991278,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.028611033938596,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 04:05:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:33:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.215824231136967,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56.690790700825,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 04:05:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:33:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15963.914889499392,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6061.761682999986,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 05:05:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:26:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 252.89774244331053,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 127.2390800299907,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 05:05:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:26:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 158.27316349987086,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.66533649993835,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 05:05:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:26:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 146.5767083485674,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.43393309823268,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 05:05:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:26:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 134.35014069593458,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.259457282322444,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 05:05:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:26:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7744.966965999993,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6107.716720999861,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:32:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:50:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 156.67808693733514,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 103.38538521333551,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:32:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:50:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 130.4804339998782,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.93042649978997,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:32:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:50:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.31995415834249,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.5905017773301,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:32:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:50:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56.84771027160454,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.097509785793804,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:32:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:50:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5091.644082499442,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2031.022674000269,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:49:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:53:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.94019860650344,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.5421797400292,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:49:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:53:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 61.469466500057024,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.857552000488795,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:49:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:53:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.180142753186054,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.58746713869253,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:49:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:53:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30.832656555818044,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.640850055882305,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:49:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:53:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 14898.216261499783,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5848.073216000557,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 04:11:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:59:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 216.7655932633364,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.49546459466849,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 04:11:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:59:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 168.2440224999482,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 88.30563050014462,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 04:11:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:59:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.29937097038504,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 53.142382924865366,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 04:11:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:59:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 126.60961795147601,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 46.474124152207025,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 04:11:42 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:59:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59762.60505950006,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 255900.11857799982,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:40:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:20:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37824.285817384,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 240964.17156914668,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:40:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:20:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37138.776646,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 242644.34600000049,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:40:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:20:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 97.65028427105449,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.83144516244961,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:40:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:20:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 100.05101871480682,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.3410904844147,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:40:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:20:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6352.270934000046,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1823.9216125002713,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:19:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:33:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 110.53123650666824,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.40534543998365,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:19:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:33:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.64712799999961,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.77597750031782,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:19:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:33:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.163775880896324,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.526363259015222,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:19:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:33:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.864718307407074,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.851952907261081,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:19:06 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:33:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 63241.37254749894,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59915.679922999974,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:41:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:41:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 27463.514011721312,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37919.86216178468,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:41:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:41:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 26067.371138500675,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37338.46950349994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:41:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:41:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 185.51776810311102,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 97.80755902200929,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:41:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:41:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 197.21703349379666,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 99.77823988364473,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:41:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:41:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1919.3635840001662,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1941.0032979999414,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:58:59 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:27:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.73241906336382,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.25834224004575,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:58:59 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:27:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.73461299961491,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.81687200025408,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:58:59 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:27:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.227132877671762,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.103759690539231,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:58:59 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:27:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.546944922548871,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.141562020117878,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:58:59 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:27:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 77040.78036249985,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2574.1813324998475,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:05:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:06:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57015.35536195602,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.24578948798566,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:05:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:06:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70062.93960500011,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.40900949972274,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:05:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:06:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.87456157393004,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.153861356958195,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:05:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:06:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.98551638928194,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.232438050034375,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:05:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:06:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3644.370075500319,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 76608.26362750004,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:21:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:06:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.1177609900169,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56964.278056652016,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:21:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:06:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.1996570000374,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69819.15318949995,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:21:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:06:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.16515333873722,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.88916489565437,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:21:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:06:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.60205459914785,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.55699893438523,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:21:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:06:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5880.626533000395,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3631.3388734997716,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:57:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:22:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.37698160937725,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 142.52117958994253,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:57:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:22:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.13219200116873,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 90.93760449923138,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:57:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:22:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 53.17952147752356,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.313517914314907,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:57:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:22:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 46.375887840359105,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.61104615833451,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-26 04:57:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:22:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 256106.76833649995,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16183.035134999955,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:19:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:06:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 241175.34822180797,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 276.1944895666469,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:19:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:06:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 242843.23037950025,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 161.85223650063563,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:19:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:06:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.7800826563357,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 147.7125780492183,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:19:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:06:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.41403563661216,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 136.00812021365124,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:19:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:06:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6005.199937999919,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 14907.566597499681,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:25:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:12:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 127.69117487666374,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 214.5425809306859,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:25:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:12:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 85.98021049999716,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 170.90863349994834,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:25:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:12:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.394355428637475,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.74411065262855,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:25:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:12:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.33758301703981,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 125.70612291771327,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:25:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:12:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6105.812039000057,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5096.1581319998,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:49:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:50:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 104.84896812001048,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.35703474664479,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:49:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:50:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.20544299994435,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.95373600014864,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:49:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:50:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.54418925176356,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.222790376713746,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:49:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:50:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.03690141021941,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30.797951826792268,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:49:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:50:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6035.864570500053,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13693.951621999531,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:56:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:46:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 123.6156219099712,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 202.06395390866965,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:56:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:46:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.08461649996934,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 160.00548500005607,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:56:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:46:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.57377718535252,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 107.6035869178744,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:56:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:46:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.19125505428643,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 112.502632367524,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 02:56:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:46:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2404.0219759999673,
+            "value": 2423.249550500259,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:38:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:39:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 115.56893722801397,
+            "value": 115.57257120667418,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:38:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:39:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.24620750004397,
+            "value": 80.09124349973717,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:38:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:39:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.934681919266144,
+            "value": 18.016155885708706,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:38:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:39:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.157597388916138,
+            "value": 16.201786527420072,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:38:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:39:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1937.9353819999778,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 62925.006167501124,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:26:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:42:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.05925926001146,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 27203.82154675731,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:26:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:42:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.29892349990405,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 25508.403936501054,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:26:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:42:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.060224252329409,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 185.4872685244404,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:26:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:42:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.175544986865699,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 196.73251961813324,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:26:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:42:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1853.6365575000673,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5266.754461499659,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:32:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:32:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.67919123664494,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 182.3137356586764,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:32:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:32:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.12286549983037,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 172.07800699998188,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:32:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:32:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.59777746379352,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.42113770176542,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:32:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:32:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.84972288918397,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.674829500863154,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-26 03:32:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:32:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -81950,668 +81950,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-05-23T22:20:55Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/a6b94433cc411da4e03724d54c5b158a24cfc6b3"
         },
-        "date": 1716796233577,
+        "date": 1716796250867,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6364.633738500004,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 73557.1878830001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:13:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:09:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 110.9386039533274,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54274.971237412,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:13:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:09:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.67654700006187,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66496.79136700001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:13:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:09:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.20073086192139,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.08082973808818,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:13:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:09:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.898053901060116,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.16252433917934,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:13:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:09:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6081.8272660001185,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 251570.10278099982,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:57:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:23:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.14610562666105,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 237403.6621778293,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:57:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:23:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.3581659999727,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 238762.90595699992,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:57:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:23:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.76893995195344,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.15832508481148,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:57:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:23:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.399661849383165,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.70448818083523,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:57:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:23:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1917.0592025002406,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55312.02972349911,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:00:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:44:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.56634040335362,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23317.65573052867,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:00:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:44:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.77325799974642,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 21930.38267600059,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:00:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:44:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.22545734704069,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 179.27011979201416,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:00:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:44:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.533952667372015,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 190.4971275555548,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:00:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:44:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7684.304952500042,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 10766.853013500167,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:33:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:49:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 155.3382852573356,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 191.8137156466728,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:33:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:49:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 128.05097249997743,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 148.7612689998059,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:33:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:49:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.028611033938596,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.71945132309382,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:33:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:49:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56.690790700825,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.43081593084692,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:33:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:49:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6061.761682999986,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1806.416813999931,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:26:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:36:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 127.2390800299907,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.19676065001597,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:26:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:36:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.66533649993835,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.85134750021825,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:26:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:36:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.43393309823268,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.474161891497117,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:26:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:36:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.259457282322444,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.706609286777685,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:26:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:36:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6107.716720999861,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1926.0076245000164,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:50:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:30:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 103.38538521333551,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78.15665756669962,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:50:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:30:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.93042649978997,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.7329220002357,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:50:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:30:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.5905017773301,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 10.959522930240288,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:50:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:30:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.097509785793804,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.039550408273529,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:50:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:30:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2031.022674000269,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 53648.168086000165,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:53:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:44:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.5421797400292,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 32871.818626446,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:53:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:44:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.857552000488795,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 32546.4946269999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:53:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:44:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.58746713869253,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 94.3608748026115,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:53:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:44:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.640850055882305,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 96.57838180275058,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:53:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:44:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5848.073216000557,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7398.480855999992,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:59:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:36:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.49546459466849,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 153.49252890133266,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:59:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:36:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 88.30563050014462,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 126.28433100007896,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:59:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:36:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 53.142382924865366,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55.40085130142113,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:59:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:36:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 46.474124152207025,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55.041959403026794,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:59:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:36:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 255900.11857799982,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6325.567019999994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:20:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:23:53 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 240964.17156914668,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 110.07393185333513,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:20:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:23:53 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 242644.34600000049,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.99479499997096,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:20:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:23:53 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.83144516244961,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.99296529157036,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:20:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:23:53 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.3410904844147,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.6789553645544,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:20:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:23:53 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1823.9216125002713,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5637.292696499571,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:33:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:00:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.40534543998365,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 117.31337282798145,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:33:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:00:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.77597750031782,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 87.89296299983107,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:33:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:00:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.526363259015222,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 51.520604634899385,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:33:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:00:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.851952907261081,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 44.854367200206454,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:33:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:00:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59915.679922999974,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5980.5163245000585,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:41:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:30:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37919.86216178468,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 124.54726676999069,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:41:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:30:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37338.46950349994,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 85.09528350009532,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:41:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:30:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 97.80755902200929,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.12558342406395,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:41:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:30:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 99.77823988364473,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.9765445679687,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:41:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:30:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1941.0032979999414,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3618.1140245007555,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:27:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:24:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.25834224004575,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 141.82623874327874,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:27:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:24:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.81687200025408,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 94.56672199985405,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:27:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:24:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.103759690539231,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.139670020996387,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:27:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:24:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.141562020117878,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.46367132795628,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:27:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:24:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2574.1813324998475,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12188.274419499066,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:06:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:07:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.24578948798566,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 161.0161329326705,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:06:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:07:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.40900949972274,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 127.06120350048877,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:06:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:07:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.153861356958195,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 108.9521467868849,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:06:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:07:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.232438050034375,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 97.51454962550012,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:06:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:07:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 76608.26362750004,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1903.7392800000816,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:06:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:02:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56964.278056652016,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 94.64907631331091,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:06:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:02:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69819.15318949995,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60.35489200030497,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:06:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:02:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.88916489565437,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.068286042205282,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:06:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:02:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.55699893438523,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.353648010151852,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:06:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:02:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3631.3388734997716,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5171.4912204997745,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:22:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:34:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 142.52117958994253,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 180.80138122403878,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:22:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:34:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 90.93760449923138,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 170.40880849981477,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:22:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:34:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.313517914314907,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.301702569622094,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:22:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:34:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.61104615833451,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.28060892167015,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:22:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:34:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16183.035134999955,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2484.973353499754,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:06:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:08:19 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 276.1944895666469,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 118.97007295466271,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:06:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:08:19 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 161.85223650063563,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.28220599969427,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:06:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:08:19 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 147.7125780492183,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.468503229402156,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:06:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:08:19 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 136.00812021365124,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.58658233498964,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:06:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:08:19 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 14907.566597499681,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11867.508392499985,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:12:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:14:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 214.5425809306859,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 197.41663807199137,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:12:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:14:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 170.90863349994834,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 150.46568049956477,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:12:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:14:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.74411065262855,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.45414866305231,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:12:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:14:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 125.70612291771327,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.95484754743093,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:12:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:14:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5096.1581319998,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5143.1367664999925,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:50:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:52:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.35703474664479,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 94.24412510669451,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:50:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:52:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.95373600014864,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.25308599986602,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:50:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:52:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.222790376713746,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.44000860144758,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:50:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:52:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30.797951826792268,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.029967242201604,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:50:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:52:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13693.951621999531,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2017.1134680003888,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:46:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:56:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 202.06395390866965,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.84266767330823,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:46:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:56:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 160.00548500005607,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.14306449992,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:46:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:56:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 107.6035869178744,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.477915355710886,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:46:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:56:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 112.502632367524,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.574823246872853,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:46:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:56:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2423.249550500259,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2402.1529614997235,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:39:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:42:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 115.57257120667418,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 114.60082032532955,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:39:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:42:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.09124349973717,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78.58655049994923,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:39:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:42:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.016155885708706,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.448930916745994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:39:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:42:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.201786527420072,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15.705658343132942,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:39:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:42:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 62925.006167501124,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6083.561831499992,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:42:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:53:31 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 27203.82154675731,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 103.6075839333292,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:42:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:53:31 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 25508.403936501054,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.40378549993875,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:42:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:53:31 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 185.4872685244404,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.45431691825654,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:42:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:53:31 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 196.73251961813324,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.949767179674346,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:42:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:53:31 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5266.754461499659,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6001.731613000175,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:32:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:00:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 182.3137356586764,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 118.3772720366763,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:32:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:00:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 172.07800699998188,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78.91565199997785,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:32:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:00:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.42113770176542,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.296355154165305,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:32:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:00:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.674829500863154,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.158321523451804,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:32:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:00:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -82632,668 +82632,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-05-23T22:20:55Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/a6b94433cc411da4e03724d54c5b158a24cfc6b3"
         },
-        "date": 1716796250867,
+        "date": 1716796264101,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 73557.1878830001,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6090.712625499691,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:09:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:06:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54274.971237412,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.75989160935812,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:09:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:06:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66496.79136700001,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.22087349917274,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:09:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:06:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.08082973808818,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55.08633192431127,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:09:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:06:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.16252433917934,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 48.06244881207504,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:09:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:06:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 251570.10278099982,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2501.6659124999023,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:23:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:46:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 237403.6621778293,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 117.83828453599442,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:23:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:46:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 238762.90595699992,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.57532300012826,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:23:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:46:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.15832508481148,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.668762762910536,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:23:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:46:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.70448818083523,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.743856165047696,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:23:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:46:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55312.02972349911,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1971.930427499501,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:44:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:34:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23317.65573052867,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.11258741336253,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:44:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:34:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 21930.38267600059,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.800313499654294,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:44:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:34:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 179.27011979201416,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.239623345137028,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:44:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:34:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 190.4971275555548,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.338959846483151,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:44:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:34:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 10766.853013500167,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2650.326251000479,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:49:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:12:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 191.8137156466728,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.98484990135087,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:49:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:12:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 148.7612689998059,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 85.89696350009035,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:49:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:12:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.71945132309382,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.95477410125936,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:49:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:12:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.43081593084692,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.90629757986169,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:49:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:12:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1806.416813999931,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1870.622367499891,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:36:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:40:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.19676065001597,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.77259321667832,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:36:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:40:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.85134750021825,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.49170350004351,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:36:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:40:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.474161891497117,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.819426943181075,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:36:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:40:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.706609286777685,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.086471520709258,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:36:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:40:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1926.0076245000164,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16360.486754500016,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:30:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:53:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78.15665756669962,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 214.87865147801796,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:30:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:53:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.7329220002357,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 172.61797999981354,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:30:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:53:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 10.959522930240288,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 133.23870575233246,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:30:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:53:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.039550408273529,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 139.48319644772516,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:30:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:53:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 53648.168086000165,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79396.0280839999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:44:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:12:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 32871.818626446,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59363.842118681336,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:44:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:12:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 32546.4946269999,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 72635.35564699987,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:44:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:12:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 94.3608748026115,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.99403910563898,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:44:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:12:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 96.57838180275058,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.204830952399,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:44:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:12:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7398.480855999992,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3683.4174590003386,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:36:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:29:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 153.49252890133266,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143.18027123327434,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:36:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:29:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 126.28433100007896,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 94.91020449968346,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:36:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:29:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55.40085130142113,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.714879681573194,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:36:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:29:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55.041959403026794,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.938975221457362,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:36:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:29:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6325.567019999994,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5144.204072500543,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:23:53 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:57:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 110.07393185333513,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.63029272668307,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:23:53 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:57:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.99479499997096,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.391946499607,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:23:53 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:57:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.99296529157036,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.518711974408774,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:23:53 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:57:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.6789553645544,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.08318313158899,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:23:53 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:57:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5637.292696499571,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19198.749321500145,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:00:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:13:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 117.31337282798145,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1733.9371251993132,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:00:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:13:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 87.89296299983107,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 288.0548259990974,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:00:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:13:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 51.520604634899385,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 159.1794857617087,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:00:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:13:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 44.854367200206454,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 146.9221872172329,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:00:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:13:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5980.5163245000585,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6406.453824000038,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:30:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:25:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 124.54726676999069,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 111.01004096666732,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:30:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:25:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 85.09528350009532,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 71.89030799997909,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:30:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:25:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.12558342406395,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.41407391325501,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:30:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:25:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.9765445679687,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.153699872384635,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:30:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:25:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3618.1140245007555,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18101.753664499938,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:24:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:19:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 141.82623874327874,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 228.79141934532768,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:24:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:19:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 94.56672199985405,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 181.4439295003467,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:24:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:19:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.139670020996387,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 150.3248974939885,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:24:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:19:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.46367132795628,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 154.98063748205632,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:24:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:19:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12188.274419499066,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6116.125251000085,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:07:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:03:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 161.0161329326705,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.95577656666651,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:07:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:03:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 127.06120350048877,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.52334149983653,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:07:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:03:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 108.9521467868849,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.03362274360493,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:07:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:03:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 97.51454962550012,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.68617036866701,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:07:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:03:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1903.7392800000816,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 71091.10926299945,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:02:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:49:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 94.64907631331091,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 32103.595689588667,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:02:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:49:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60.35489200030497,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31386.506036500577,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:02:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:49:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.068286042205282,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 193.23542843331893,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:02:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:49:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.353648010151852,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 203.83707845247366,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:02:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:49:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5171.4912204997745,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7972.799128499901,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:34:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:38:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 180.80138122403878,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.74704809600038,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:34:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:38:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 170.40880849981477,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 131.2453224999217,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:34:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:38:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.301702569622094,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.0586387482135,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:34:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:38:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.28060892167015,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.609164657590256,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:34:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:38:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2484.973353499754,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64822.7314310002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:08:19 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:47:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 118.97007295466271,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42262.49112645533,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:08:19 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:47:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.28220599969427,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42000.16301999995,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:08:19 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:47:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.468503229402156,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 100.80040609375101,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:08:19 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:47:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.58658233498964,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 103.00390717133526,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:08:19 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:47:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11867.508392499985,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6158.610826499853,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:14:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:56:22 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 197.41663807199137,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 103.86893211331275,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:14:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:56:22 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 150.46568049956477,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.60312150006575,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:14:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:56:22 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.45414866305231,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.851245836175835,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:14:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:56:22 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.95484754743093,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.351281267023836,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:14:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:56:22 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5143.1367664999925,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1960.810206000133,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:52:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:06:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 94.24412510669451,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 96.944098916656,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:52:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:06:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.25308599986602,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 62.437391000457865,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:52:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:06:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.44000860144758,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.507841379162667,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:52:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:06:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.029967242201604,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.779278380467078,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:52:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:06:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2017.1134680003888,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6102.325008500088,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:56:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:31:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.84266767330823,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 126.6733946866619,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:56:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:31:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.14306449992,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.94268999993892,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:56:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:31:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.477915355710886,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.903677714071506,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:56:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:31:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.574823246872853,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.694808280519105,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:56:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:31:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2402.1529614997235,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2055.765110500033,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:42:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:00:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 114.60082032532955,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.66302794668566,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:42:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:00:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78.58655049994923,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.71838249956272,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:42:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:00:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.448930916745994,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.723184369227475,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:42:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:00:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15.705658343132942,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.82769511533102,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:42:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:00:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6083.561831499992,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 260015.09256049985,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:53:31 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:27:37 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 103.6075839333292,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 245449.95221155128,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:53:31 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:27:37 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.40378549993875,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 246880.73067000005,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:53:31 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:27:37 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.45431691825654,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.5629141692095,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:53:31 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:27:37 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.949767179674346,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.7125993259444,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:53:31 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:27:37 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6001.731613000175,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5494.843467499777,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:00:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:39:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 118.3772720366763,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 183.25016152003082,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:00:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:39:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78.91565199997785,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 174.3125165003221,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:00:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:39:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.296355154165305,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.02541022551337,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:00:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:39:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.158321523451804,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.13659364626864,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:00:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:39:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -83314,1350 +83314,1350 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-05-23T22:20:55Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/a6b94433cc411da4e03724d54c5b158a24cfc6b3"
         },
-        "date": 1716796264101,
+        "date": 1716884889708,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6090.712625499691,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16201.605522999671,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:06:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:56:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.75989160935812,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 211.75247188932855,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:06:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:56:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.22087349917274,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 173.3825245000844,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:06:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:56:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55.08633192431127,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 131.57625620982662,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:06:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:56:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 48.06244881207504,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 138.15701727392738,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:06:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:56:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2501.6659124999023,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19580.348333499387,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:46:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:35:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 117.83828453599442,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1972.5583384559795,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:46:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:35:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.57532300012826,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 304.52867400072137,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:46:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:35:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.668762762910536,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 159.21254493580938,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:46:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:35:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.743856165047696,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 147.1007761496972,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:46:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:35:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1971.930427499501,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6395.053391500028,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:34:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:24:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.11258741336253,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 111.79965263333163,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:34:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:24:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.800313499654294,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.57267850011795,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:34:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:24:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.239623345137028,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.45611448358153,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:34:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:24:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.338959846483151,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.183693710455756,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:34:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:24:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2650.326251000479,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6090.38060700027,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:12:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:05:22 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.98484990135087,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.32542474666631,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:12:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:05:22 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 85.89696350009035,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.89052250008899,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:12:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:05:22 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.95477410125936,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.143692672131074,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:12:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:05:22 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.90629757986169,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.69986765979999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:12:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:05:22 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1870.622367499891,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69626.14286649931,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:40:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:02:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.77259321667832,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31168.30876382667,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:40:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:02:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.49170350004351,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30344.20343200054,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:40:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:02:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.819426943181075,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 191.61807984477645,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:40:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:02:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.086471520709258,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 202.43252203435824,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:40:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:02:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16360.486754500016,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2051.071790500373,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:53:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:03:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 214.87865147801796,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.49340154662302,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:53:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:03:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 172.61797999981354,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.705377999984194,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:53:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:03:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 133.23870575233246,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.760500238520041,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:53:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:03:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 139.48319644772516,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.848424723985891,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:53:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:03:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79396.0280839999,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1858.8892060001854,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:12:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:42:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59363.842118681336,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.7504453600286,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:12:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:42:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 72635.35564699987,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60.39118899980167,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:12:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:42:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.99403910563898,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.863294365573847,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:12:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:42:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.204830952399,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.124532837536922,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:12:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:42:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3683.4174590003386,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6160.110381499862,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:29:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:57:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.18027123327434,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 103.44071540000793,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:29:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:57:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 94.91020449968346,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.7529545000416,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:29:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:57:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.714879681573194,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.84408546641255,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:29:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:57:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.938975221457362,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.29976061199906,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:29:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:57:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5144.204072500543,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7953.783899500081,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:57:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:38:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.63029272668307,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.2286971733347,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:57:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:38:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.391946499607,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 133.54343499997867,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:57:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:38:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.518711974408774,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.04505810898644,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:57:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:38:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.08318313158899,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.62338436575791,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:57:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:38:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19198.749321500145,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1953.3334455004479,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:13:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:09:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1733.9371251993132,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 97.69248994335915,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:13:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:09:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 288.0548259990974,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 62.62143650019425,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:13:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:09:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 159.1794857617087,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.45766893648569,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:13:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:09:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 146.9221872172329,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.79064085537104,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 05:13:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:09:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6406.453824000038,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2653.223817000253,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:25:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:15:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 111.01004096666732,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.80828239603458,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:25:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:15:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 71.89030799997909,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 85.46653200028231,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:25:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:15:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.41407391325501,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.888232515726404,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:25:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:15:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.153699872384635,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.860818240535956,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:25:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:15:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18101.753664499938,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1959.0576464997866,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:19:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:36:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 228.79141934532768,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.11340747995442,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:19:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:36:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 181.4439295003467,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.480169999729696,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:19:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:36:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 150.3248974939885,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.237360838286664,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:19:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:36:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 154.98063748205632,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.315057310507168,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:19:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:36:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6116.125251000085,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5529.696613000851,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:03:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 04:44:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.95577656666651,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 183.931280946701,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:03:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 04:44:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.52334149983653,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 172.97463999966567,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:03:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 04:44:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.03362274360493,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.11881991682275,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:03:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 04:44:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.68617036866701,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.249203532127844,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:03:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 04:44:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 71091.10926299945,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 259406.86361500047,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:49:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:29:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 32103.595689588667,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 245299.80470982537,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:49:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:29:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31386.506036500577,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 246052.46434299988,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:49:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:29:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 193.23542843331893,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.71551285958857,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:49:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:29:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 203.83707845247366,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.99507171601392,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:49:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:29:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7972.799128499901,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79653.13417749986,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:38:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:14:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.74704809600038,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59612.45751327067,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:38:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:14:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 131.2453224999217,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 72702.21142749983,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:38:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:14:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.0586387482135,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.14168817573746,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:38:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:14:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.609164657590256,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.53047094928182,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:38:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:14:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64822.7314310002,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6115.854938499979,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:47:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:27:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42262.49112645533,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 122.14745964796748,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:47:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:27:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42000.16301999995,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 90.83024049959931,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:47:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:27:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 100.80040609375101,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55.07349761011697,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:47:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:27:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 103.00390717133526,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 48.15403204615488,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:47:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:27:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6158.610826499853,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5146.552168999733,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:56:22 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:16:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 103.86893211331275,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.85134442655058,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:56:22 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:16:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.60312150006575,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.19642650023161,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:56:22 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:16:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.851245836175835,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.59965823925252,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:56:22 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:16:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.351281267023836,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.13763988135345,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:56:22 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:16:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1960.810206000133,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2502.701516000343,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:06:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:48:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 96.944098916656,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 117.60287315465757,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:06:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:48:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 62.437391000457865,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.2339850001299,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:06:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:48:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.507841379162667,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.695595869744587,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:06:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:48:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.779278380467078,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.790618902530667,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:06:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:48:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6102.325008500088,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64031.919593999875,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:31:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:47:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 126.6733946866619,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41767.55604205267,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:31:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:47:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.94268999993892,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40978.205925499875,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:31:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:47:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.903677714071506,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 100.63790912789109,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:31:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:47:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.694808280519105,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 102.86352524807036,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 02:31:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:47:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2055.765110500033,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3670.86116649989,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:00:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 04:33:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.66302794668566,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143.0698215466873,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:00:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 04:33:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.71838249956272,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.82754149990069,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:00:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 04:33:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.723184369227475,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.60844089167771,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:00:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 04:33:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.82769511533102,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.943433647832865,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 04:00:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 04:33:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 260015.09256049985,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6097.099796500061,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:27:37 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:32:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 245449.95221155128,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 128.64881523998747,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:27:37 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:32:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 246880.73067000005,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 85.4196249999859,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:27:37 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:32:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.5629141692095,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.767088579470645,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:27:37 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:32:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.7125993259444,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.68399358107581,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-27 03:27:37 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:32:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5494.843467499777,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17989.83830050065,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:39:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:22:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 183.25016152003082,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 228.6407266526506,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:39:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:22:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 174.3125165003221,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 182.07948700000998,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:39:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:22:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.02541022551337,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 149.10081188543847,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:39:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:22:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.13659364626864,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 153.7965519629422,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-27 04:39:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:22:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
       {
         "commit": {
           "author": {
-            "name": "dhuangnm",
-            "username": "dhuangnm",
-            "email": "74931910+dhuangnm@users.noreply.github.com"
+            "name": "Andy Linfoot",
+            "username": "andy-neuma",
+            "email": "78757007+andy-neuma@users.noreply.github.com"
           },
           "committer": {
             "name": "GitHub",
             "username": "web-flow",
             "email": "noreply@github.com"
           },
-          "id": "a6b94433cc411da4e03724d54c5b158a24cfc6b3",
-          "message": "update install commands (#264)\n\nUse nm pypi to install.\r\n\r\n---------\r\n\r\nCo-authored-by: dhuangnm <dhuang@MacBook-Pro-2.local>",
-          "timestamp": "2024-05-23T22:20:55Z",
-          "url": "https://github.com/neuralmagic/nm-vllm/commit/a6b94433cc411da4e03724d54c5b158a24cfc6b3"
+          "id": "a160eb945554fe6581427558577c28a07577d53b",
+          "message": "switch benchmarking and testing jobs to run using \"test\" label (#273)\n\nSUMMARY:\r\n* update benchmarking, testing, and accuracy jobs to run on label\r\n`aws-test-a10g-24G` or `aws-test-a10-96G` which is based on \"vanilla\r\ndeeplearning\" AMI\r\n* update relevant GHA actions and workflows to not be dependent on\r\n`pyenv` virtualenv\r\n* update \"model cache\" to use local disk as opposed to \"EFS\"\r\n\r\nTEST PLAN:\r\nruns on remote push\r\n\r\n---------\r\n\r\nCo-authored-by: andy-neuma <andy@neuralmagic.com>\r\nCo-authored-by: Domenic Barbuzzi <domenic@neuralmagic.com>",
+          "timestamp": "2024-05-30T23:47:14Z",
+          "url": "https://github.com/neuralmagic/nm-vllm/commit/a160eb945554fe6581427558577c28a07577d53b"
         },
-        "date": 1716884889708,
+        "date": 1717139962713,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16201.605522999671,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1944.325737500094,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:56:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:27:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 211.75247188932855,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.34725278669308,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:56:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:27:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 173.3825245000844,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.81818549982563,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:56:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:27:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 131.57625620982662,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.076899806348088,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:56:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:27:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 138.15701727392738,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.171032847039678,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:56:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:27:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19580.348333499387,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3637.295575499593,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:35:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:24:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1972.5583384559795,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 142.26459940323667,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:35:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:24:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 304.52867400072137,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.88423449925176,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:35:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:24:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 159.21254493580938,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.2629330304988,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:35:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:24:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 147.1007761496972,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.54240381698151,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:35:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:24:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6395.053391500028,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1918.2469320003293,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:24:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:58:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 111.79965263333163,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.97013177663636,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:24:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:58:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.57267850011795,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.28486299990254,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:24:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:58:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.45611448358153,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.22190179143599,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:24:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:58:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.183693710455756,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.48989199564831,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:24:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:58:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6090.38060700027,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2557.2039315002257,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:05:22 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:04:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.32542474666631,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.180840850661,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:05:22 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:04:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.89052250008899,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.95183950005958,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:05:22 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:04:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.143692672131074,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.851389004810983,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:05:22 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:04:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.69986765979999,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.897805321379792,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:05:22 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:04:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69626.14286649931,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6099.552767499972,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:02:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:50:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31168.30876382667,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 103.5047685733419,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:02:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:50:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30344.20343200054,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.46140199991169,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:02:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:50:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 191.61807984477645,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.55624090990519,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:02:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:50:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 202.43252203435824,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.06870531956898,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:02:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:50:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2051.071790500373,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11868.245605000084,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:03:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:45:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.49340154662302,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 193.44991913400673,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:03:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:45:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.705377999984194,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 152.2072479997405,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:03:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:45:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.760500238520041,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.0558582996331,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:03:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:45:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.848424723985891,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.73852507711578,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:03:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:45:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1858.8892060001854,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13403.280489499593,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:42:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:11:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.7504453600286,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 204.9766981546703,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:42:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:11:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60.39118899980167,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.49696250031775,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:42:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:11:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.863294365573847,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 103.17390393887145,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:42:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:11:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.124532837536922,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 106.09828485727913,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:42:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:11:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6160.110381499862,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56598.711176999816,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:57:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:41:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 103.44071540000793,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34986.424011133335,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:57:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:41:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.7529545000416,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34492.74494500003,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:57:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:41:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.84408546641255,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 96.16211967293555,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:57:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:41:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.29976061199906,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 98.01024498023664,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:57:33 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:41:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7953.783899500081,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7510.366780500021,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:38:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:32:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.2286971733347,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 154.15236881733472,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:38:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:32:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 133.54343499997867,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 125.7449084999962,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:38:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:32:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.04505810898644,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55.98934003479815,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:38:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:32:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.62338436575791,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55.47155360192235,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:38:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:32:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1953.3334455004479,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 73175.14265900014,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:09:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:05:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 97.69248994335915,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54376.65658664667,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:09:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:05:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 62.62143650019425,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65844.2635580002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:09:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:05:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.45766893648569,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.31190011712384,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:09:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:05:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.79064085537104,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 63.81002672030278,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:09:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:05:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2653.223817000253,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5988.901054000053,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:15:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:25:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.80828239603458,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 127.66245276332938,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:15:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:25:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 85.46653200028231,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.23423700003241,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:15:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:25:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.888232515726404,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.098250105891715,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:15:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:25:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.860818240535956,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.04777014300449,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:15:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:25:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1959.0576464997866,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6070.1711980000255,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:36:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:57:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.11340747995442,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.75487558334467,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:36:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:57:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.480169999729696,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.90729949999331,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:36:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:57:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.237360838286664,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.47720721275356,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:36:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:57:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.315057310507168,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.12483052955296,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:36:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:57:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5529.696613000851,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1837.0898674997989,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 04:44:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:33:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 183.931280946701,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.4850439266326,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 04:44:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:33:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 172.97463999966567,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56.32419250014209,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 04:44:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:33:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.11881991682275,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.571876431442663,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 04:44:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:33:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.249203532127844,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.806236899356767,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 04:44:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:33:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 259406.86361500047,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13168.645871999615,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:29:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:07:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 245299.80470982537,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 165.8596725520183,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:29:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:07:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 246052.46434299988,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 130.18640399968717,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:29:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:07:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.71551285958857,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 117.19142559224628,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:29:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:07:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.99507171601392,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 105.56268794302693,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:29:01 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:07:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79653.13417749986,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2387.404865499775,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:14:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:39:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59612.45751327067,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 115.58159378532824,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:14:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:39:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 72702.21142749983,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.98823249995257,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:14:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:39:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.14168817573746,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.715592278812036,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:14:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:39:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.53047094928182,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.002970293327937,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:14:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:39:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6115.854938499979,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57220.16011000051,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:27:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:41:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 122.14745964796748,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24262.49963393664,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:27:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:41:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 90.83024049959931,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 22467.024267500165,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:27:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:41:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55.07349761011697,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 180.6796389991773,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:27:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:41:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 48.15403204615488,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 191.46666942799854,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:27:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:41:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5146.552168999733,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5815.404029999627,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:16:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:00:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.85134442655058,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.14034139065431,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:16:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:00:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.19642650023161,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.83753450047516,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:16:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:00:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.59965823925252,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 52.66757137009773,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:16:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:00:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.13763988135345,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 45.88396688034135,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 05:16:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:00:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2502.701516000343,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5082.756575499843,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:48:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:52:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 117.60287315465757,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 90.22497969325438,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:48:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:52:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.2339850001299,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.9079714996551,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:48:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:52:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.695595869744587,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.283243296958865,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:48:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:52:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.790618902530667,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30.845509096645735,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 03:48:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:52:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64031.919593999875,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 250019.64793699994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:47:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:20:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41767.55604205267,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 235926.69918582065,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:47:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:20:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40978.205925499875,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 237063.49971650002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:47:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:20:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 100.63790912789109,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.50191722485226,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:47:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:20:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 102.86352524807036,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.43693336512705,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:47:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:20:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3670.86116649989,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6359.681814499936,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 04:33:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:19:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.0698215466873,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 110.18178417332972,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 04:33:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:19:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.82754149990069,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.56043249996446,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 04:33:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:19:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.60844089167771,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.18134608225374,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 04:33:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:19:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.943433647832865,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.889334661983305,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-28 04:33:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:19:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6097.099796500061,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5235.807850499441,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:32:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:32:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 128.64881523998747,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 181.38514000669238,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:32:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:32:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 85.4196249999859,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 171.369899999263,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:32:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:32:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.767088579470645,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.002259343469056,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:32:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:32:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.68399358107581,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.27258014968854,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 02:32:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:32:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17989.83830050065,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2030.2278369995292,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:22:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:52:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 228.6407266526506,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.64925478003109,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:22:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:52:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 182.07948700000998,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.549836000333016,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:22:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:52:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 149.10081188543847,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.573610354072041,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:22:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:52:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 153.7965519629422,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.648292482642121,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.3.0\",\n    \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-28 04:22:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:52:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -84678,668 +84678,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-05-30T23:47:14Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/a160eb945554fe6581427558577c28a07577d53b"
         },
-        "date": 1717139962713,
+        "date": 1717140262706,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1944.325737500094,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3681.097016500189,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:27:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:25:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.34725278669308,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 145.07798170667533,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:27:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:25:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.81818549982563,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 94.51540750069398,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:27:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:25:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.076899806348088,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.4821328702638,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:27:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:25:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.171032847039678,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.905893282612915,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:27:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:25:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3637.295575499593,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6092.723453499957,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:24:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:59:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 142.26459940323667,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.56179224000728,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:24:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:59:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.88423449925176,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.33099699994273,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:24:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:59:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.2629330304988,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.01282205005549,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:24:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:59:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.54240381698151,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.60662206821262,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:24:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:59:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1918.2469320003293,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 77949.97632699984,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:58:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:08:29 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.97013177663636,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58397.31349270267,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:58:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:08:29 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.28486299990254,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 71317.3302465002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:58:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:08:29 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.22190179143599,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.33699641951138,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:58:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:08:29 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.48989199564831,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.32680799715409,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:58:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:08:29 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2557.2039315002257,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16155.477776499993,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:04:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:48:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.180840850661,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 213.6813949946748,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:04:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:48:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.95183950005958,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 166.9866280003589,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:04:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:48:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.851389004810983,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 130.65772220447064,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:04:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:48:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.897805321379792,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 136.31033031322832,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:04:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:48:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6099.552767499972,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5469.732399999884,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:50:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 103.5047685733419,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 181.56679687196933,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:50:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.46140199991169,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 170.89766099979897,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:50:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.55624090990519,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.890397382191004,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:50:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.06870531956898,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.03514549061607,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:50:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11868.245605000084,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6405.0570609999795,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:45:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:21:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 193.44991913400673,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 110.46892652666808,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:45:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:21:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 152.2072479997405,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.12654699997256,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:45:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:21:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.0558582996331,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.48747596856293,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:45:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:21:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.73852507711578,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.15420376744941,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:45:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:21:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13403.280489499593,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69819.26233450031,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:11:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:44:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 204.9766981546703,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31218.575220786664,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:11:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:44:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.49696250031775,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30551.620906000608,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:11:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:44:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 103.17390393887145,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 191.78076755449828,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:11:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:44:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 106.09828485727913,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 202.4536070982777,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:11:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:44:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56598.711176999816,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1864.540482999928,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:41:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:36:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34986.424011133335,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.86396844667918,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:41:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:36:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34492.74494500003,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61.090040000181034,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:41:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:36:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 96.16211967293555,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.848978666162784,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:41:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:36:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 98.01024498023664,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.125742792799205,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:41:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:36:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7510.366780500021,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1952.8336919997855,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:32:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:01:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 154.15236881733472,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 96.46964666999945,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:32:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:01:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 125.7449084999962,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60.690230000091105,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:32:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:01:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55.98934003479815,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.52746929280426,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:32:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:01:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55.47155360192235,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.802806236467346,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:32:43 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:01:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 73175.14265900014,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65052.52764399995,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:05:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:43:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54376.65658664667,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42204.86256041533,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:05:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:43:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65844.2635580002,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41962.35900250008,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:05:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:43:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.31190011712384,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 100.4298790739846,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:05:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:43:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 63.81002672030278,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 102.9204985423722,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:05:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:43:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5988.901054000053,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17808.83305749967,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:25:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:14:37 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 127.66245276332938,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 233.17962850800117,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:25:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:14:37 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.23423700003241,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 185.84594550020483,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:25:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:14:37 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.098250105891715,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 147.34437494057235,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:25:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:14:37 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.04777014300449,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 152.39974601979338,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:25:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:14:37 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6070.1711980000255,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 257918.9633305002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:57:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:23:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.75487558334467,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 243078.29471475002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:57:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:23:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.90729949999331,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 244911.33549849998,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:57:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:23:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.47720721275356,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.08091762761569,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:57:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:23:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.12483052955296,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.5571945066391,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:57:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:23:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1837.0898674997989,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5137.703709498965,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:33:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:54:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.4850439266326,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.3516359397375,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:33:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:54:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56.32419250014209,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.1904954995407,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:33:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:54:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.571876431442663,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.53577292241982,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:33:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:54:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.806236899356767,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.10265720510569,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:33:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:54:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13168.645871999615,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6097.918027500782,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:07:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:02:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 165.8596725520183,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 122.9422584399678,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:07:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:02:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 130.18640399968717,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.72214050017647,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:07:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:02:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 117.19142559224628,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55.01082736972813,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:07:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:02:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 105.56268794302693,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 47.926743500999976,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:07:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:02:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2387.404865499775,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6231.304446499735,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:39:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:52:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 115.58159378532824,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 104.27052092667206,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:39:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:52:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.98823249995257,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.09737800015137,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:39:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:52:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.715592278812036,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.07329957322707,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:39:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:52:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.002970293327937,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.51991764659937,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:39:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:52:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57220.16011000051,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1966.5706364999096,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:41:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:30:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24262.49963393664,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.57006852665896,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:41:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:30:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 22467.024267500165,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.585513499740046,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:41:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:30:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 180.6796389991773,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.250328231535653,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:41:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:30:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 191.46666942799854,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.308544198746658,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:41:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:30:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5815.404029999627,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19427.53360250026,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:00:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:09:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.14034139065431,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1855.6937588053127,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:00:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:09:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.83753450047516,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 296.91589050071343,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:00:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:09:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 52.66757137009773,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 159.95018913828906,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:00:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:09:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 45.88396688034135,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 147.70638199244252,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:00:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:09:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5082.756575499843,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2056.6150060003565,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:52:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:55:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 90.22497969325438,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.96489467331655,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:52:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:55:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.9079714996551,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.91846550025002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:52:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:55:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.283243296958865,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.71997602230433,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:52:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:55:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30.845509096645735,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.823541390495773,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:52:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:55:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 250019.64793699994,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7941.3647344999845,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:20:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:34:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 235926.69918582065,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.8378843453329,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:20:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:34:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 237063.49971650002,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 132.93674150008883,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:20:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:34:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.50191722485226,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.73406646219327,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:20:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:34:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.43693336512705,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.449693636550265,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:20:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:34:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6359.681814499936,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2634.1090030000487,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:19:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:07:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 110.18178417332972,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 122.19483824533867,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:19:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:07:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.56043249996446,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.74080150042573,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:19:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:07:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.18134608225374,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.89030300157944,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:19:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:07:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.889334661983305,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.869823937415298,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:19:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:07:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5235.807850499441,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2499.371483000232,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:32:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:42:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 181.38514000669238,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 116.43903942266479,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:32:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:42:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 171.369899999263,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.38136549925912,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:32:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:42:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.002259343469056,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.588815181987712,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:32:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:42:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.27258014968854,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.76198581413518,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:32:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:42:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2030.2278369995292,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6090.403449000064,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:52:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:27:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.64925478003109,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 128.24602427333352,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:52:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:27:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.549836000333016,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.39523949996419,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:52:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:27:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.573610354072041,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.83419598811897,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:52:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:27:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.648292482642121,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.633694117844954,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:52:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:27:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -85360,1350 +85360,1350 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-05-30T23:47:14Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/a160eb945554fe6581427558577c28a07577d53b"
         },
-        "date": 1717140262706,
+        "date": 1717140353849,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3681.097016500189,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6101.936902499801,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:25:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:03:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 145.07798170667533,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 123.02423784131679,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:25:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:03:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 94.51540750069398,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 87.86930899896106,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:25:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:03:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.4821328702638,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54.97718879974617,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:25:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:03:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.905893282612915,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 47.95826503696106,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:25:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:03:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6092.723453499957,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19787.037736000457,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:59:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:11:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.56179224000728,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2235.3907888513045,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:59:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:11:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.33099699994273,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 372.0090970000456,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:59:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:11:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.01282205005549,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 160.99269413570408,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:59:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:11:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.60662206821262,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 148.18166033948762,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:59:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:11:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 77949.97632699984,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18575.993863500116,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:08:29 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:16:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58397.31349270267,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 235.9791442813427,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:08:29 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:16:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 71317.3302465002,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 188.92866600026537,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:08:29 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:16:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.33699641951138,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 153.52913859861425,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:08:29 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:16:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.32680799715409,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.80588856760255,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:08:29 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:16:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16155.477776499993,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2633.929095500207,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:48:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:09:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 213.6813949946748,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 122.34896825732964,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:48:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:09:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 166.9866280003589,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 85.49028099969291,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:48:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:09:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 130.65772220447064,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.880498046451304,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:48:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:09:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 136.31033031322832,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.826315582132132,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:48:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:09:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5469.732399999884,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5154.42712749973,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:56:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 181.56679687196933,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.0732080667949,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:56:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 170.89766099979897,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.8526425004311,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:56:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.890397382191004,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.63378935697634,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:56:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.03514549061607,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.18654722510851,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:33:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:56:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6405.0570609999795,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5510.173583000324,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:21:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:35:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 110.46892652666808,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 183.90581848666866,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:21:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:35:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.12654699997256,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 173.44350299936195,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:21:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:35:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.48747596856293,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.12222401973724,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:21:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:35:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.15420376744941,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.3146013820182,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:21:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:35:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69819.26233450031,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6160.090393999781,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:44:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:53:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31218.575220786664,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 104.7565511466837,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:44:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:53:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30551.620906000608,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.41715999987719,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:44:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:53:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 191.78076755449828,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.85193974046793,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:44:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:53:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 202.4536070982777,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.32121772631665,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:44:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:53:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1864.540482999928,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1968.2251990002442,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:36:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:02:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.86396844667918,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 97.58126243000636,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:36:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:02:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 61.090040000181034,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.73574000017834,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:36:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:02:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.848978666162784,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.535554081098677,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:36:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:02:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.125742792799205,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.841490286731958,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:36:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:02:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1952.8336919997855,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2500.1692185001048,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:01:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:42:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 96.46964666999945,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 117.6543817773151,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:01:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:42:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60.690230000091105,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.94254399950296,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:01:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:42:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.52746929280426,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.658654500287465,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:01:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:42:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.802806236467346,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.88008315499938,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:01:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:42:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65052.52764399995,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16548.779949500386,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:43:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:49:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42204.86256041533,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 217.80040674933417,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:43:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:49:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41962.35900250008,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 171.68737250040067,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:43:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:49:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 100.4298790739846,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 135.78960418614363,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:43:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:49:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 102.9204985423722,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 141.47805946059194,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:43:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:49:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17808.83305749967,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6403.514041999983,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:14:37 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:22:31 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 233.17962850800117,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 111.89103688666287,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:14:37 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:22:31 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 185.84594550020483,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 71.60771150000755,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:14:37 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:22:31 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 147.34437494057235,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.45230292008293,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:14:37 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:22:31 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 152.39974601979338,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.14832892103383,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:14:37 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:22:31 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 257918.9633305002,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1967.7691030005917,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:23:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:30:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 243078.29471475002,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.52202370666905,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:23:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:30:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 244911.33549849998,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.060174000293046,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:23:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:30:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.08091762761569,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.238786017536206,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:23:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:30:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.5571945066391,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.315820831757966,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:23:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:30:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5137.703709498965,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79659.19065699996,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:54:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:09:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.3516359397375,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59643.81296608933,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:54:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:09:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.1904954995407,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 72820.22716649999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:54:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:09:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.53577292241982,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.75460141925505,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:54:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:09:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.10265720510569,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.82052626066309,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:54:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:09:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6097.918027500782,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3681.1479669995606,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:02:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:26:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 122.9422584399678,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143.8134585365939,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:02:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:26:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.72214050017647,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 94.1684270001133,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:02:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:26:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55.01082736972813,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.65994576060037,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:02:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:26:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 47.926743500999976,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.927389066522032,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:02:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:26:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6231.304446499735,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6075.782038499938,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:52:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:00:46 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 104.27052092667206,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.86152282999653,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:52:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:00:46 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.09737800015137,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78.82327049992455,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:52:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:00:46 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.07329957322707,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.00520032747418,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:52:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:00:46 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.51991764659937,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.601370668698856,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:52:45 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:00:46 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1966.5706364999096,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1854.4511349996355,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:30:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:36:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.57006852665896,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 94.31250808664421,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:30:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:36:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.585513499740046,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61.06378100002985,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:30:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:36:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.250328231535653,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.87062517806835,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:30:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:36:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.308544198746658,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.122588032508888,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:30:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:36:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19427.53360250026,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6109.639319000053,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:09:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:28:59 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1855.6937588053127,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 125.6489557500087,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:09:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:28:59 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 296.91589050071343,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 85.53572450000502,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:09:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:28:59 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 159.95018913828906,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.853244697887696,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:09:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:28:59 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 147.70638199244252,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.62685607274077,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:09:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:28:59 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2056.6150060003565,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 258289.11931399966,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:55:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:24:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.96489467331655,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 243556.90976791063,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:55:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:24:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.91846550025002,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 245145.88887850026,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:55:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:24:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.71997602230433,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.76708780687586,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:55:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:24:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.823541390495773,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.04582460958845,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:55:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:24:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7941.3647344999845,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7979.495186500003,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:34:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:35:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.8378843453329,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 158.29165735467421,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:34:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:35:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 132.93674150008883,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 128.5408804999406,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:34:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:35:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.73406646219327,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.75648220407465,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:34:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:35:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.449693636550265,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.42222664757649,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:34:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:35:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2634.1090030000487,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2061.239989999649,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:07:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:56:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 122.19483824533867,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.57219613999648,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:07:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:56:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.74080150042573,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.54033799982426,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:07:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:56:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.89030300157944,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.749096802673108,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:07:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:56:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.869823937415298,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.85373386428935,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:07:44 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:56:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2499.371483000232,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69805.44086600002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:42:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:45:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 116.43903942266479,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31145.775594269337,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:42:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:45:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.38136549925912,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30031.524023999737,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:42:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:45:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.588815181987712,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 192.17454173585992,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:42:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:45:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.76198581413518,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 202.79648379991565,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:42:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:45:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6090.403449000064,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 63371.482394500046,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:27:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:44:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 128.24602427333352,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40907.49880209133,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:27:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:44:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.39523949996419,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40442.42481850006,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:27:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:44:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.83419598811897,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 99.60641739782716,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:27:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:44:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.633694117844954,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 102.15978282274332,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:27:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:44:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
       {
         "commit": {
           "author": {
-            "name": "Andy Linfoot",
-            "username": "andy-neuma",
-            "email": "78757007+andy-neuma@users.noreply.github.com"
+            "name": "Domenic Barbuzzi",
+            "username": "dbarbuzzi",
+            "email": "dbarbuzzi@gmail.com"
           },
           "committer": {
             "name": "GitHub",
             "username": "web-flow",
             "email": "noreply@github.com"
           },
-          "id": "a160eb945554fe6581427558577c28a07577d53b",
-          "message": "switch benchmarking and testing jobs to run using \"test\" label (#273)\n\nSUMMARY:\r\n* update benchmarking, testing, and accuracy jobs to run on label\r\n`aws-test-a10g-24G` or `aws-test-a10-96G` which is based on \"vanilla\r\ndeeplearning\" AMI\r\n* update relevant GHA actions and workflows to not be dependent on\r\n`pyenv` virtualenv\r\n* update \"model cache\" to use local disk as opposed to \"EFS\"\r\n\r\nTEST PLAN:\r\nruns on remote push\r\n\r\n---------\r\n\r\nCo-authored-by: andy-neuma <andy@neuralmagic.com>\r\nCo-authored-by: Domenic Barbuzzi <domenic@neuralmagic.com>",
-          "timestamp": "2024-05-30T23:47:14Z",
-          "url": "https://github.com/neuralmagic/nm-vllm/commit/a160eb945554fe6581427558577c28a07577d53b"
+          "id": "019467500eee9542833aedd6e2c17bd62ffa4b40",
+          "message": "Handle server startup failure in __enter__ (#274)\n\nWith a context manager class, the `__exit__` method is not called when\r\nan exception is raised during the context manager’s `__enter__` method.\r\nThis PR addresses that by manually calling that method if an exception\r\nis raised.",
+          "timestamp": "2024-05-31T14:44:40Z",
+          "url": "https://github.com/neuralmagic/nm-vllm/commit/019467500eee9542833aedd6e2c17bd62ffa4b40"
         },
-        "date": 1717140353849,
+        "date": 1717180644086,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6101.936902499801,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2483.930193499873,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:03:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:57:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 123.02423784131679,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 117.35805675598749,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:03:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:57:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 87.86930899896106,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.19903650019478,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:03:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:57:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54.97718879974617,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.4518560301072,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:03:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:57:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 47.95826503696106,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.64258402208881,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:03:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:57:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19787.037736000457,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64156.925899000045,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:11:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:58:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2235.3907888513045,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41265.56946777867,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:11:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:58:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 372.0090970000456,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40700.54304000007,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:11:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:58:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 160.99269413570408,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 99.41443055996379,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:11:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:58:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 148.18166033948762,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 101.44261223264846,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 05:11:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:58:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18575.993863500116,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6044.974260001254,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:16:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:17:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 235.9791442813427,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 122.41275711870792,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:16:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:17:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 188.92866600026537,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.05784200073685,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:16:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:17:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 153.52913859861425,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54.7597112437143,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:16:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:17:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.80588856760255,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 47.7405560297403,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:16:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:17:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2633.929095500207,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5139.528454000356,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:09:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:10:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 122.34896825732964,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.85536933997113,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:09:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:10:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 85.49028099969291,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.92951749937492,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:09:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:10:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.880498046451304,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.51518399288244,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:09:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:10:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.826315582132132,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.096406452598835,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:09:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:10:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5154.42712749973,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6079.625620499883,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:56:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:15:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.0732080667949,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 118.71143561999966,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:56:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:15:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.8526425004311,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 88.30147799994847,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:56:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:15:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.63378935697634,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.055339955676246,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:56:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:15:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.18654722510851,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.657973738382104,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:56:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:15:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5510.173583000324,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17561.944785500145,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:35:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:30:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 183.90581848666866,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 227.8076672553434,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:35:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:30:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 173.44350299936195,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 187.1709194997493,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:35:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:30:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.12222401973724,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 144.49130965311733,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:35:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:30:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.3146013820182,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 149.549457722834,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:35:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:30:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6160.090393999781,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6065.366650499982,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:53:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:43:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 104.7565511466837,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 128.55514536999408,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:53:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:43:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.41715999987719,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.8598694999746,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:53:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:43:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.85193974046793,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.68262714845876,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:53:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:43:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.32121772631665,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.64022278004714,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:53:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:43:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1968.2251990002442,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1855.0040115001138,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:02:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:51:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 97.58126243000636,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.68821522665712,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:02:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:51:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.73574000017834,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.1674144998251,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:02:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:51:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.535554081098677,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.802702506272544,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:02:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:51:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.841490286731958,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.06161011869142,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 04:02:43 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:51:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2500.1692185001048,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 257382.11322200004,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:42:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:38:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 117.6543817773151,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 242495.9619792253,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:42:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:38:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.94254399950296,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 244313.38900100012,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:42:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:38:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.658654500287465,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.75110738836511,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:42:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:38:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.88008315499938,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.94275516991033,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:42:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:38:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16548.779949500386,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1969.4924140003423,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:49:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:45:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 217.80040674933417,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.35959133329622,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:49:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:45:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 171.68737250040067,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.487208000286046,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:49:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:45:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 135.78960418614363,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.248569927175273,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:49:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:45:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 141.47805946059194,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.295491081985142,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:49:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:45:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6403.514041999983,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3673.7472699996943,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:22:31 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:40:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 111.89103688666287,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 144.2571111866861,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:22:31 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:40:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 71.60771150000755,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.48925600029179,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:22:31 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:40:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.45230292008293,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.44395916409429,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:22:31 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:40:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.14832892103383,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.86383695044318,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:22:31 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:40:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1967.7691030005917,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68155.64958150026,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:30:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:59:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.52202370666905,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30194.675852978682,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:30:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:59:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.060174000293046,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 28690.57850199988,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:30:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:59:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.238786017536206,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 190.7608596009059,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:30:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:59:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.315820831757966,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 201.7939926306953,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:30:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:59:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79659.19065699996,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6151.039890499987,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:09:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:08:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59643.81296608933,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 103.85566856666021,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:09:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:08:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 72820.22716649999,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.88881099969512,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:09:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:08:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.75460141925505,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.84323044008759,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:09:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:08:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.82052626066309,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.29530609472412,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:09:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:08:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3681.1479669995606,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2053.6112204995334,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:26:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:11:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.8134585365939,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.4595237199804,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:26:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:11:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 94.1684270001133,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.40970950022893,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:26:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:11:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.65994576060037,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.703956566091634,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:26:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:11:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.927389066522032,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.811263026890341,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:26:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:11:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6075.782038499938,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18710.82966599988,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:00:46 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:24:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.86152282999653,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1399.1507489806645,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:00:46 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:24:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78.82327049992455,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 249.77231499997288,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:00:46 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:24:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.00520032747418,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.22021375171903,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:00:46 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:24:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.601370668698856,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 144.86078924680373,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:00:46 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:24:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1854.4511349996355,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1946.9225399998322,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:36:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:17:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 94.31250808664421,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 96.67453586000799,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:36:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:17:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 61.06378100002985,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60.83936699997139,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:36:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:17:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.87062517806835,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.44668294658585,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:36:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:17:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.122588032508888,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.679400031987392,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:36:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:17:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6109.639319000053,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80104.1921975002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:28:59 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:23:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 125.6489557500087,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60030.959854124,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:28:59 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:23:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 85.53572450000502,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 72977.70491000006,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:28:59 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:23:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.853244697887696,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.59751619510877,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:28:59 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:23:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.62685607274077,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.67641990006418,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:28:59 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:23:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 258289.11931399966,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6391.792147499984,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:24:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:36:41 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 243556.90976791063,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 110.53380199333787,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:24:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:36:41 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 245145.88887850026,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.93591650001463,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:24:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:36:41 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.76708780687586,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.42738148463032,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:24:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:36:41 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.04582460958845,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.14441306690115,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:24:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:36:41 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7979.495186500003,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7917.227143500099,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:35:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:49:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 158.29165735467421,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.27479189600157,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:35:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:49:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 128.5408804999406,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 132.1375730000227,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:35:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:49:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.75648220407465,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.54781930375723,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:35:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:49:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.42222664757649,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.22668530605768,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:35:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:49:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2061.239989999649,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5409.386085500046,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:56:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:49:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.57219613999648,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 184.59242691737745,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:56:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:49:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.54033799982426,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 177.04604800019297,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:56:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:49:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.749096802673108,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.6985874906362,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:56:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:49:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.85373386428935,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.919782852467804,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 03:56:27 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:49:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69805.44086600002,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2628.640983999958,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:45:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:23:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31145.775594269337,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.39504159467955,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:45:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:23:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30031.524023999737,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.09308050022446,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:45:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:23:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 192.17454173585992,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.776976989626807,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:45:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:23:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 202.79648379991565,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.794335007256045,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 04:45:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:23:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 63371.482394500046,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15499.0497900003,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:44:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:04:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40907.49880209133,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 208.6239780786691,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:44:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:04:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40442.42481850006,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 167.15830099974482,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:44:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:04:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 99.60641739782716,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 124.87639899359934,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:44:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:04:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 102.15978282274332,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 130.46732669284876,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 02:44:20 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:04:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -86724,668 +86724,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-05-31T14:44:40Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/019467500eee9542833aedd6e2c17bd62ffa4b40"
         },
-        "date": 1717180644086,
+        "date": 1717226724529,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2483.930193499873,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13433.544474500195,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:57:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:16:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 117.35805675598749,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 209.0220402253411,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:57:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:16:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.19903650019478,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 159.21709350004676,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:57:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:16:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.4518560301072,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 104.42860481970348,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:57:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:16:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.64258402208881,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 107.95830541999374,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:57:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:16:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64156.925899000045,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6026.254502000256,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:58:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41265.56946777867,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 122.55780948666447,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:58:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40700.54304000007,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78.32719400016686,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:58:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 99.41443055996379,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.456075298873564,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:58:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 101.44261223264846,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.16743588026698,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:58:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6044.974260001254,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57687.95476100058,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:17:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:46:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 122.41275711870792,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24589.487326492675,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:17:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:46:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.05784200073685,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23176.755627000603,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:17:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:46:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54.7597112437143,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 181.31298034386248,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:17:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:46:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 47.7405560297403,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 192.1832146050638,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:17:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:46:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5139.528454000356,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56125.272259500096,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:10:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:45:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.85536933997113,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34504.20830972532,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:10:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:45:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.92951749937492,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34041.64477699999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:10:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:45:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.51518399288244,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.96340996331081,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:10:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:45:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.096406452598835,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 98.10117683706032,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:10:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:45:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6079.625620499883,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2597.9437940004573,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:15:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:09:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 118.71143561999966,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.2349371866864,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:15:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:09:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 88.30147799994847,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.46986550031943,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:15:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:09:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.055339955676246,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.01261998820881,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:15:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:09:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.657973738382104,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.105187230427656,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:15:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:09:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17561.944785500145,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2386.4957324999523,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:30:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:44:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 227.8076672553434,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 116.10263050399833,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:30:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:44:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 187.1709194997493,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.63464149947686,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:30:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:44:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 144.49130965311733,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.781622552837995,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:30:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:44:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 149.549457722834,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15.980207944737126,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:30:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:44:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6065.366650499982,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 249056.0881115,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:43:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:25:39 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 128.55514536999408,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 234769.65152463395,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:43:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:25:39 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.8598694999746,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 235907.7511035,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:43:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:25:39 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.68262714845876,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.73109339173806,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:43:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:25:39 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.64022278004714,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.4586457380798,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:43:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:25:39 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1855.0040115001138,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6099.945930500098,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:51:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.68821522665712,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 106.01067297334339,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:51:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.1674144998251,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.5510320001631,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:51:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.802702506272544,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.51520349307166,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:51:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.06161011869142,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.0217535809248,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:51:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 257382.11322200004,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6369.022003499936,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:38:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:24:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 242495.9619792253,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 109.86370787999249,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:38:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:24:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 244313.38900100012,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.98295800001597,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:38:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:24:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.75110738836511,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.19853741333624,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:38:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:24:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.94275516991033,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.854873681434434,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:38:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:24:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1969.4924140003423,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3643.2665580000503,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:45:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:29:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.35959133329622,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 141.6193678199003,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:45:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:29:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.487208000286046,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.86853350022284,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:45:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:29:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.248569927175273,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.360144188083904,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:45:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:29:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.295491081985142,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.628681514108088,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:45:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:29:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3673.7472699996943,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5795.081401499374,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:40:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:04:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 144.2571111866861,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.1639660506286,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:40:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:04:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.48925600029179,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 86.6784984991682,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:40:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:04:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.44395916409429,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 52.65340047924928,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:40:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:04:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.86383695044318,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 45.83244374329012,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:40:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:04:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68155.64958150026,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1941.5869834997466,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:59:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30194.675852978682,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78.86718834666681,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:59:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 28690.57850199988,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.243063499725395,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:59:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 190.7608596009059,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.086835355884213,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:59:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 201.7939926306953,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.167377682341602,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:59:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6151.039890499987,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13001.913375500408,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:08:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:12:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 103.85566856666021,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 164.54864900934749,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:08:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:12:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.88881099969512,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 132.40938450053363,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:08:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:12:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.84323044008759,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 115.63882724243625,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:08:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:12:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.29530609472412,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 104.19328698760164,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:08:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:12:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2053.6112204995334,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5079.459464000138,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:11:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:57:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.4595237199804,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.60726572002507,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:11:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:57:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.40970950022893,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.30312999964372,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:11:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:57:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.703956566091634,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.200136683882086,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:11:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:57:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.811263026890341,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30.799919584297072,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:11:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:57:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18710.82966599988,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1915.7010099993386,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:24:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:03:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1399.1507489806645,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.33896263003044,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:24:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:03:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 249.77231499997288,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.913551500154426,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:24:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:03:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.22021375171903,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.154389783356644,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:24:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:03:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 144.86078924680373,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.435429443586605,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 16:24:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:03:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1946.9225399998322,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7518.360590500038,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:17:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:37:19 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 96.67453586000799,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 155.8976876106687,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:17:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:37:19 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60.83936699997139,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 131.7351630000303,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:17:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:37:19 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.44668294658585,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56.1581977850762,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:17:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:37:19 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.679400031987392,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55.68204937520141,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:17:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:37:19 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80104.1921975002,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5995.356360000073,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:23:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:30:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60030.959854124,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 128.52373509334674,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:23:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:30:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 72977.70491000006,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 86.87627850002855,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:23:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:30:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.59751619510877,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.286603577508885,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:23:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:30:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.67641990006418,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.21856208176765,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 14:23:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:30:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6391.792147499984,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5242.797461000009,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:36:41 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:36:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 110.53380199333787,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 181.47744490133482,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:36:41 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:36:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.93591650001463,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 171.1728239997683,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:36:41 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:36:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.42738148463032,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.08456078704941,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:36:41 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:36:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.14441306690115,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.247383541717056,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:36:41 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:36:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7917.227143500099,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1849.2760655003622,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:49:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.27479189600157,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.93386266001713,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:49:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 132.1375730000227,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.14978950027216,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:49:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.54781930375723,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.566374270951178,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:49:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.22668530605768,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.79882193081735,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 13:49:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5409.386085500046,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 74319.16772249997,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:49:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:10:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 184.59242691737745,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54905.67989840266,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:49:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:10:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 177.04604800019297,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67115.6059944999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:49:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:10:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.6985874906362,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.20423739689029,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:49:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:10:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.919782852467804,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.20086864799721,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-05-31 15:49:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:10:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2628.640983999958,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11776.748068499728,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:23:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:50:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.39504159467955,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 192.14932382466395,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:23:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:50:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.09308050022446,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 151.4051315002689,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:23:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:50:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.776976989626807,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 90.38786759309662,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:23:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:50:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.794335007256045,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.01644828327463,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:23:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:50:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15499.0497900003,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2038.1226959998457,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:04:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:57:35 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 208.6239780786691,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.25553335331158,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:04:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:57:35 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 167.15830099974482,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.933951999752026,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:04:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:57:35 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 124.87639899359934,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.60494243109782,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:04:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:57:35 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 130.46732669284876,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.695432862760343,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-05-31 15:04:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:57:35 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -87406,668 +87406,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-05-31T14:44:40Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/019467500eee9542833aedd6e2c17bd62ffa4b40"
         },
-        "date": 1717226724529,
+        "date": 1717226809492,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13433.544474500195,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5465.104090500063,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:16:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:36:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 209.0220402253411,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 182.38181604533747,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:16:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:36:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 159.21709350004676,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 172.48556349932187,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:16:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:36:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 104.42860481970348,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.01814435216436,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:16:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:36:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 107.95830541999374,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.23224205729735,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:16:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:36:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6026.254502000256,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64389.11264650028,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:44:11 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 122.55780948666447,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41904.01268081,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:44:11 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78.32719400016686,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41665.827635500136,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:44:11 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.456075298873564,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 100.50732660572248,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:44:11 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.16743588026698,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 103.06239928118974,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:44:11 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57687.95476100058,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1968.1510505001825,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:46:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24589.487326492675,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.35354207994776,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:46:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23176.755627000603,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.13816049994057,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:46:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 181.31298034386248,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.212979792208433,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:46:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 192.1832146050638,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.30842540938585,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:46:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56125.272259500096,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6409.06140650003,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:45:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:22:19 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34504.20830972532,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 110.28681744666224,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:45:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:22:19 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34041.64477699999,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.82857050002167,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:45:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:22:19 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.96340996331081,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.45402963019084,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:45:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:22:19 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 98.10117683706032,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.183773724756755,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:45:34 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:22:19 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2597.9437940004573,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5139.758131000235,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:09:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:57:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.2349371866864,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.55492249328138,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:09:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:57:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.46986550031943,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.3277339999695,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:09:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:57:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.01261998820881,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.51161150505591,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:09:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:57:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.105187230427656,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.122926756589106,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:09:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:57:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2386.4957324999523,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1952.253523500076,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:44:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:04:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 116.10263050399833,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 97.06344014667896,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:44:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:04:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.63464149947686,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61.132333999921684,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:44:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:04:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.781622552837995,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.518420134876393,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:44:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:04:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15.980207944737126,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.81889525331887,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:44:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:04:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 249056.0881115,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15772.96817550041,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:25:39 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:51:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 234769.65152463395,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 212.41135819334036,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:25:39 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:51:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 235907.7511035,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 168.6653629999455,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:25:39 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:51:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.73109339173806,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 126.16649284113906,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:25:39 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:51:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.4586457380798,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 132.16589601969588,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:25:39 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:51:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6099.945930500098,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7947.8744014999165,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:35:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 106.01067297334339,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 158.44730231334324,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:35:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.5510320001631,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 135.29116350002823,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:35:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.51520349307166,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.84342651133543,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:35:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.0217535809248,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.437561346413524,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:26 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:35:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6369.022003499936,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3682.9004324999914,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:24:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:27:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 109.86370787999249,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 144.17348532340233,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:24:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:27:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.98295800001597,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.40964100010751,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:24:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:27:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.19853741333624,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.450300879533238,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:24:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:27:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.854873681434434,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.877735847614165,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:24:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:27:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3643.2665580000503,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6091.999458499913,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:29:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:28:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 141.6193678199003,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 129.25372375333382,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:29:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:28:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.86853350022284,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.15678899995783,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:29:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:28:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.360144188083904,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.74735498837957,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:29:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:28:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.628681514108088,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.65306078308095,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:29:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:28:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5795.081401499374,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80321.45212449995,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:04:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.1639660506286,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60084.608386216,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:04:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 86.6784984991682,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 73040.18949800024,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:04:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 52.65340047924928,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.53753827580553,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:04:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 45.83244374329012,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.11969253932963,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:04:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1941.5869834997466,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2639.5517854998616,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:10:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78.86718834666681,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.02935196665324,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:10:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.243063499725395,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.91525050001292,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:10:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.086835355884213,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.807965012021203,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:10:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.167377682341602,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.892292434902217,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:10:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13001.913375500408,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2058.8162895001005,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:12:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 164.54864900934749,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.24307422668062,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:12:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 132.40938450053363,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.71235049993993,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:12:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 115.63882724243625,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.78980069313554,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:12:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 104.19328698760164,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.875658441741487,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:12:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5079.459464000138,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6143.030510499784,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:57:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.60726572002507,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 123.48958786333544,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:57:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.30312999964372,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.64175400015483,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:57:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.200136683882086,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.00522969330008,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:57:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30.799919584297072,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.680604117973,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:57:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1915.7010099993386,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17717.51661799999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:03:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:17:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.33896263003044,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 229.6791376726675,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:03:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:17:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.913551500154426,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 184.0309030003482,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:03:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:17:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.154389783356644,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 146.70600372650583,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:03:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:17:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.435429443586605,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 151.8096421807653,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:03:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:17:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7518.360590500038,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19371.194835000097,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:37:19 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:12:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 155.8976876106687,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1825.0263110479912,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:37:19 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:12:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 131.7351630000303,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 296.2281604995951,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:37:19 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:12:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56.1581977850762,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 159.1624820135071,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:37:19 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:12:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55.68204937520141,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 147.04514007011437,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:37:19 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:12:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5995.356360000073,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68792.75241000051,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:30:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:46:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 128.52373509334674,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30492.831526224,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:30:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:46:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 86.87627850002855,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 29110.252202000083,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:30:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:46:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.286603577508885,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 191.33320536899203,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:30:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:46:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.21856208176765,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 202.79778100221915,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:30:33 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:46:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5242.797461000009,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6151.4509609999095,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:36:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 181.47744490133482,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 106.6259924333493,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:36:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 171.1728239997683,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.0594135000756,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:36:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.08456078704941,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.80231963248332,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:36:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.247383541717056,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.29096479789972,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:36:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1849.2760655003622,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 256516.0198184999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.93386266001713,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 242469.63911015532,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.14978950027216,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 242928.41760100055,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.566374270951178,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.73875155506106,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.79882193081735,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.58589442037847,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 74319.16772249997,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2521.811208499912,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:10:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:45:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54905.67989840266,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 117.12486600530733,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:10:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:45:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67115.6059944999,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.18184749966895,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:10:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:45:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.20423739689029,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.673919768941687,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:10:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:45:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.20086864799721,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.760043884738042,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:10:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:45:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11776.748068499728,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1856.4668409999285,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:50:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 192.14932382466395,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.92989755330139,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:50:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 151.4051315002689,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61.78311549956561,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:50:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 90.38786759309662,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.772554630902334,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:50:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.01644828327463,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.100548923487823,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:50:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2038.1226959998457,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6128.83922400124,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:57:35 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:04:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.25553335331158,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.032034792023,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:57:35 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:04:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.933951999752026,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.85366000019712,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:57:35 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:04:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.60494243109782,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54.9551760090202,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:57:35 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:04:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.695432862760343,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 47.91666134291712,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:57:35 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:04:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -88088,668 +88088,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-05-31T14:44:40Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/019467500eee9542833aedd6e2c17bd62ffa4b40"
         },
-        "date": 1717226809492,
+        "date": 1717226907541,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5465.104090500063,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2638.090289000047,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:36:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:11:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 182.38181604533747,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 124.13079550531378,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:36:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:11:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 172.48556349932187,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 86.69665100023849,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:36:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:11:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.01814435216436,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.85371453968665,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:36:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:11:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.23224205729735,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.748642085078284,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:36:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:11:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64389.11264650028,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17609.569553000256,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:44:11 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:18:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41904.01268081,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 231.22858862998328,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:44:11 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:18:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41665.827635500136,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 186.24312999963877,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:44:11 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:18:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 100.50732660572248,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 145.26747173435956,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:44:11 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:18:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 103.06239928118974,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 151.15857947976536,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:44:11 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:18:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1968.1510505001825,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6160.346129499885,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:56:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.35354207994776,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 105.34277093999967,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:56:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.13816049994057,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.88684999992256,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:56:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.212979792208433,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.80396775220805,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:56:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.30842540938585,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.31792395837699,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:56:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6409.06140650003,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6098.687131000588,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:22:19 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:07:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 110.28681744666224,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.37724877200769,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:22:19 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:07:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.82857050002167,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.9813774994982,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:22:19 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:07:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.45402963019084,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54.648997186359274,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:22:19 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:07:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.183773724756755,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 47.64992248008313,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:22:19 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:07:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5139.758131000235,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69584.89189899956,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:57:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:48:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.55492249328138,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31048.29460035534,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:57:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:48:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.3277339999695,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30093.43344050012,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:57:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:48:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.51161150505591,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 192.66771929707056,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:57:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:48:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.122926756589106,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 203.56901990804454,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:57:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:48:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1952.253523500076,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2483.630244500091,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:04:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:45:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 97.06344014667896,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 117.69600437866029,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:04:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:45:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 61.132333999921684,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.64083249989562,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:04:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:45:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.518420134876393,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.522834716305688,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:04:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:45:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.81889525331887,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.63574917771536,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:04:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:45:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15772.96817550041,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7919.601063499954,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:51:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:38:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 212.41135819334036,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.3910337440011,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:51:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:38:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 168.6653629999455,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 129.13522700000613,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:51:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:38:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 126.16649284113906,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.9569065085812,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:51:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:38:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 132.16589601969588,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.51430730106501,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:51:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:38:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7947.8744014999165,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1949.4820319996506,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:35:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:05:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 158.44730231334324,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 96.94478731334432,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:35:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:05:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 135.29116350002823,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61.578827499943145,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:35:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:05:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.84342651133543,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.49369730497954,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:35:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:05:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.437561346413524,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.834836465259444,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:35:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:05:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3682.9004324999914,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5395.93860450077,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:27:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:39:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 144.17348532340233,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 181.97643779730666,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:27:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:39:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.40964100010751,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 170.70760699971288,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:27:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:39:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.450300879533238,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.696208352486806,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:27:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:39:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.877735847614165,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.85903324170393,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:27:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:39:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6091.999458499913,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1958.2405209998797,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:28:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:33:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 129.25372375333382,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.26579094668462,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:28:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:33:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.15678899995783,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.981102999514405,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:28:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:33:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.74735498837957,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.196314056028056,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:28:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:33:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.65306078308095,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.308551286685358,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:28:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:33:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80321.45212449995,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19417.44170650054,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:15:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60084.608386216,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1984.1171261193424,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:15:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 73040.18949800024,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 299.54049000025407,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:15:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.53753827580553,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 160.0769743161095,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:15:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.11969253932963,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 147.35185909605494,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:15:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2639.5517854998616,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64378.06861699982,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:10:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:47:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.02935196665324,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41894.84862783465,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:10:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:47:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.91525050001292,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41211.34732250005,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:10:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:47:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.807965012021203,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 100.44607479219636,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:10:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:47:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.892292434902217,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 102.86384090894646,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:10:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:47:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2058.8162895001005,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6099.562576000097,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:31:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.24307422668062,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 126.94176616332773,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:31:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.71235049993993,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 90.25698550010475,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:31:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.78980069313554,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.78026131030745,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:31:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.875658441741487,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.56709518774522,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:12 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:31:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6143.030510499784,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2051.8238935001136,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 123.48958786333544,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.6770349466739,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.64175400015483,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42.637954500150954,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.00522969330008,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.685178672366225,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.680604117973,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.807880624142099,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17717.51661799999,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1862.6115929996558,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:17:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:39:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 229.6791376726675,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 94.27098721333701,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:17:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:39:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 184.0309030003482,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.117498499745125,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:17:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:39:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 146.70600372650583,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.844234496377625,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:17:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:39:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 151.8096421807653,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.151565290528257,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:17:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:39:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19371.194835000097,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6394.527255500008,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:12:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:25:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1825.0263110479912,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 111.05473383999576,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:12:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:25:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 296.2281604995951,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.38765350000631,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:12:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:25:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 159.1624820135071,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.40613075942994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:12:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:25:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 147.04514007011437,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.10810124192214,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:12:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:25:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68792.75241000051,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6068.195037999885,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:46:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:03:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30492.831526224,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.8231543266902,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:46:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:03:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 29110.252202000083,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.73207999993792,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:46:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:03:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 191.33320536899203,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.951069390166126,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:46:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:03:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 202.79778100221915,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.54447877075444,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:46:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:03:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6151.4509609999095,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15614.073387499957,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:52:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 106.6259924333493,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 215.67482790600783,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:52:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.0594135000756,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 172.90871850036638,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:52:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.80231963248332,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 126.92911536352358,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:52:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.29096479789972,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 132.3077546218594,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:52:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 256516.0198184999,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5134.5485170004395,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:00:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 242469.63911015532,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.24353658669618,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:00:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 242928.41760100055,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.83863849916816,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:00:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.73875155506106,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.537869481502305,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:00:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.58589442037847,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.134182925720367,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:00:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2521.811208499912,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78420.08978000013,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:45:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 117.12486600530733,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58702.017414699985,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:45:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.18184749966895,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 72048.7134335001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:45:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.673919768941687,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.3635887082443,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:45:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.760043884738042,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.80326064012668,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:45:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1856.4668409999285,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 257630.41754999995,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.92989755330139,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 244192.06504996668,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 61.78311549956561,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 244848.77846500013,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.772554630902334,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.32903447195388,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.100548923487823,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.0383085146902,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6128.83922400124,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3678.4869454995714,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:04:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:31:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.032034792023,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143.53516155332423,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:04:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:31:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.85366000019712,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.8544999989972,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:04:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:31:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54.9551760090202,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.619680555833128,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:04:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:31:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 47.91666134291712,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.842106511884577,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:04:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:31:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -88770,668 +88770,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-05-31T14:44:40Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/019467500eee9542833aedd6e2c17bd62ffa4b40"
         },
-        "date": 1717226907541,
+        "date": 1717226920680,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2638.090289000047,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5153.267233499719,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:11:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:59:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 124.13079550531378,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.72724149327648,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:11:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:59:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 86.69665100023849,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.75649099886505,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:11:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:59:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.85371453968665,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.56325137920866,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:11:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:59:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.748642085078284,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.135406988289148,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:11:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:59:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17609.569553000256,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6411.88908099997,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:18:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:23:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 231.22858862998328,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 110.83134152666541,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:18:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:23:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 186.24312999963877,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.34774449998804,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:18:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:23:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 145.26747173435956,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.46555021775157,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:18:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:23:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 151.15857947976536,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.16765750887564,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:18:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:23:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6160.346129499885,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5465.795638499003,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:56:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:38:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 105.34277093999967,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 182.45962854265963,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:56:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:38:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.88684999992256,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 171.95940000055998,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:56:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:38:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.80396775220805,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.97504323687587,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:56:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:38:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.31792395837699,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.226725419496674,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:56:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:38:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6098.687131000588,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80529.45460850015,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:07:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.37724877200769,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60326.79254720133,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:07:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.9813774994982,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 73896.06737299982,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:07:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54.648997186359274,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.78470235412063,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:07:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 47.64992248008313,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.91019339517356,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:07:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69584.89189899956,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6095.397781999964,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:48:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:30:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31048.29460035534,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 125.82052522332332,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:48:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:30:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30093.43344050012,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 85.79779149999922,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:48:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:30:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 192.66771929707056,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.91476485872919,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:48:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:30:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 203.56901990804454,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.6967848006478,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:48:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:30:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2483.630244500091,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 259042.47589599958,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:45:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 117.69600437866029,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 244226.45942004398,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:45:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.64083249989562,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 245914.3064604996,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:45:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.522834716305688,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.77792115304503,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:45:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.63574917771536,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.17802914457835,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:45:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7919.601063499954,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1956.863275499927,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:38:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.3910337440011,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.70074537330827,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:38:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 129.13522700000613,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.54534800050169,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:38:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.9569065085812,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.222250544463073,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:38:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.51430730106501,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.288621271329562,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:38:27 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1949.4820319996506,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3676.1369844998626,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:05:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:30:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 96.94478731334432,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 144.38824281663983,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:05:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:30:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 61.578827499943145,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.10873100043682,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:05:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:30:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.49369730497954,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.436373215296747,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:05:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:30:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.834836465259444,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.844668904210636,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:05:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:30:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5395.93860450077,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7972.256407499913,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:39:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:37:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 181.97643779730666,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 156.6752595119912,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:39:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:37:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 170.70760699971288,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 130.30707749999237,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:39:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:37:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.696208352486806,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.938903093863736,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:39:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:37:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.85903324170393,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.552133147019305,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:39:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:37:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1958.2405209998797,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70102.49670000031,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:33:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:48:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.26579094668462,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31426.950578070013,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:33:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:48:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.981102999514405,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30875.1951510003,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:33:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:48:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.196314056028056,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 192.71480064987958,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:33:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:48:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.308551286685358,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 203.45817038187403,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:33:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:48:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19417.44170650054,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6055.798128000788,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:15:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:06:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1984.1171261193424,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.03516805731974,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:15:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:06:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 299.54049000025407,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 87.93307399992045,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:15:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:06:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 160.0769743161095,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54.60481309716015,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:15:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:06:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 147.35185909605494,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 47.55947662492116,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:15:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:06:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64378.06861699982,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2646.620501999678,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:47:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:10:40 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41894.84862783465,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.66360963468833,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:47:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:10:40 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41211.34732250005,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 85.93796700006351,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:47:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:10:40 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 100.44607479219636,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.870622147981955,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:47:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:10:40 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 102.86384090894646,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.90668610635907,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:47:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:10:40 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6099.562576000097,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6173.267183000235,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:31:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:27 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 126.94176616332773,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 104.52148489999067,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:31:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:27 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 90.25698550010475,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.1482324996523,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:31:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:27 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.78026131030745,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.879281108834356,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:31:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:27 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.56709518774522,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.40217023112008,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:31:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:27 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2051.8238935001136,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1963.2678400007535,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:04:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.6770349466739,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 96.80734718000774,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:04:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42.637954500150954,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.49763450007595,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:04:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.685178672366225,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.528203331464441,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:04:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.807880624142099,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.879467540563104,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:55 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:04:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1862.6115929996558,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15955.817038000077,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:39:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:51:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 94.27098721333701,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 210.95193908932862,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:39:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:51:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.117498499745125,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 168.29410699983782,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:39:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:51:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.844234496377625,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 129.05553459825703,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:39:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:51:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.151565290528257,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 133.93621821245296,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:39:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:51:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6394.527255500008,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6079.774405500075,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:25:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 111.05473383999576,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 123.21347402667318,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:25:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.38765350000631,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 85.48538349987211,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:25:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.40613075942994,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.91949216651743,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:25:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.10810124192214,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.54900252614391,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:25:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6068.195037999885,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65760.63130699993,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:03:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:45:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.8231543266902,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42493.21184977733,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:03:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:45:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.73207999993792,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42642.46320699988,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:03:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:45:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.951069390166126,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 100.45283879670744,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:03:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:45:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.54447877075444,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 102.5090991386381,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:03:21 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:45:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15614.073387499957,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19413.51446599947,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:52:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:14:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 215.67482790600783,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1911.2615646006827,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:52:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:14:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 172.90871850036638,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 301.560390999839,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:52:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:14:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 126.92911536352358,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 160.18905535696663,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:52:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:14:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 132.3077546218594,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 148.03388925635815,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:52:10 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:14:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5134.5485170004395,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2055.1594009998553,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:00:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.24353658669618,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.2215669666851,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:00:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.83863849916816,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.70422399965901,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:00:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.537869481502305,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.804952142759143,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:00:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.134182925720367,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.80095409422303,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:00:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78420.08978000013,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2505.315351000263,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:44:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58702.017414699985,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 116.81682827600282,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:44:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 72048.7134335001,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.92226000033043,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:44:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.3635887082443,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.642578608419566,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:44:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.80326064012668,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.85448758325757,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:44:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 257630.41754999995,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1873.8067115000376,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 244192.06504996668,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.4497275599612,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 244848.77846500013,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.6132789999283,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.32903447195388,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.815606907763367,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.0383085146902,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.124262533905293,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3678.4869454995714,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18127.93551750019,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:31:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:17:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.53516155332423,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 227.99471685199507,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:31:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:17:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.8544999989972,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 182.33619399961754,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:31:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:17:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.619680555833128,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 149.28410064460172,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:31:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:17:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.842106511884577,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 153.7595891936329,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:31:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:17:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -89452,668 +89452,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-05-31T14:44:40Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/019467500eee9542833aedd6e2c17bd62ffa4b40"
         },
-        "date": 1717226920680,
+        "date": 1717399193605,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5153.267233499719,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2055.8448110000427,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:59:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.72724149327648,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.56332087997241,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:59:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.75649099886505,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.486727499934204,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:59:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.56325137920866,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.734279113870905,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:59:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.135406988289148,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.832082933123225,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:59:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6411.88908099997,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 255519.67455000023,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:23:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:18:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 110.83134152666541,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 241553.6490950207,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:23:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:18:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.34774449998804,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 242416.74192250002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:23:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:18:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.46555021775157,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.40638422233403,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:23:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:18:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.16765750887564,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.7109178209585,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:23:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:18:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5465.795638499003,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6064.189869000074,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:38:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:55:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 182.45962854265963,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.64453962000455,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:38:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:55:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 171.95940000055998,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.92926149986124,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:38:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:55:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.97504323687587,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.951733724818304,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:38:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:55:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.226725419496674,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.57946328211994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:38:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:55:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80529.45460850015,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3678.7117545,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:21:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60326.79254720133,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 142.70120577671452,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:21:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 73896.06737299982,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 94.0109705006762,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:21:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.78470235412063,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.657908070816337,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:21:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.91019339517356,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.014463232733515,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:11:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:21:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6095.397781999964,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18655.240721000155,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:30:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:05:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 125.82052522332332,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1362.2770440846543,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:30:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:05:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 85.79779149999922,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 262.8703115005919,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:30:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:05:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.91476485872919,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.27636172303048,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:30:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:05:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.6967848006478,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 145.06888759254596,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:30:18 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:05:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 259042.47589599958,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2627.0912965001116,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 244226.45942004398,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.82107498667513,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 245914.3064604996,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 85.28235999983735,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.77792115304503,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.810647724409144,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.17802914457835,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.80232702724121,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:26:14 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1956.863275499927,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6131.326911000315,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:48:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.70074537330827,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 104.16482551332592,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:48:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.54534800050169,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.90202300003511,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:48:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.222250544463073,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.78329706453252,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:48:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.288621271329562,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.25952119505826,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:32:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:48:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3676.1369844998626,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69814.66886199996,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:30:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:40:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 144.38824281663983,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31225.528803002006,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:30:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:40:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.10873100043682,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30391.95524150091,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:30:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:40:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.436373215296747,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 191.78759748293524,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:30:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:40:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.844668904210636,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 202.26347417634994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:30:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:40:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7972.256407499913,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7891.291946000024,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:37:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:30:15 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 156.6752595119912,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 156.65347058000043,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:37:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:30:15 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 130.30707749999237,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 132.4609829999872,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:37:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:30:15 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.938903093863736,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.602982839799644,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:37:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:30:15 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.552133147019305,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.20217650230741,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:37:07 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:30:15 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70102.49670000031,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6055.366905000483,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:48:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:58:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31426.950578070013,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.88969452534124,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:48:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:58:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30875.1951510003,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 88.752284000293,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:48:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:58:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 192.71480064987958,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54.74072664358886,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:48:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:58:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 203.45817038187403,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 47.74285928966727,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 04:48:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:58:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6055.798128000788,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1955.5083455002205,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:06:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.03516805731974,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 97.00293331333948,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:06:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 87.93307399992045,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 62.8722764995473,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:06:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54.60481309716015,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.454777052584479,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:06:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 47.55947662492116,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.78043380833142,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:06:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2646.620501999678,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15412.548571500338,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:10:40 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.66360963468833,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 210.16727801800093,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:10:40 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 85.93796700006351,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 163.89254700015954,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:10:40 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.870622147981955,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 124.69744632406882,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:10:40 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.90668610635907,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 129.96581216853258,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:10:40 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6173.267183000235,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16840.66650349996,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:27 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:10:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 104.52148489999067,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 220.79072606800037,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:27 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:10:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.1482324996523,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 171.54757699972834,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:27 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:10:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.879281108834356,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 136.65848210703913,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:27 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:10:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.40217023112008,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143.03363637500365,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:55:27 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:10:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1963.2678400007535,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2459.1634369999156,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:04:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 96.80734718000774,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 116.2581605826866,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:04:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.49763450007595,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.49792600008732,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:04:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.528203331464441,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.410164495557872,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:04:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.879467540563104,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.566398486916665,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:04:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15955.817038000077,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5129.752017000101,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:51:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:50:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 210.95193908932862,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.16210111339994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:51:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:50:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 168.29410699983782,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.41876699945715,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:51:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:50:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 129.05553459825703,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.49815985398641,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:51:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:50:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 133.93621821245296,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.062567575039832,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:51:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:50:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6079.774405500075,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5430.696962000184,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:30:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 123.21347402667318,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 182.9749076986797,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:30:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 85.48538349987211,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 173.23178100014047,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:30:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.91949216651743,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.03292704847647,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:30:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.54900252614391,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.00412580783481,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:02:34 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:30:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65760.63130699993,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6086.761909499955,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:45:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:23:26 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42493.21184977733,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 127.48856736667524,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:45:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:23:26 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42642.46320699988,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 87.618281999994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:45:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:23:26 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 100.45283879670744,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.758815015615966,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:45:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:23:26 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 102.5090991386381,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.57903657881813,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 02:45:40 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:23:26 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19413.51446599947,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6387.866823999957,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:14:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1911.2615646006827,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 109.88967173333322,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:14:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 301.560390999839,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.6485330000437,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:14:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 160.18905535696663,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.42783907498824,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:14:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 148.03388925635815,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.13229719876569,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-01 05:14:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2055.1594009998553,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78795.04042050007,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:03:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.2215669666851,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59064.32769362,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:03:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.70422399965901,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 72272.24899349995,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:03:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.804952142759143,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.32638058805797,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:03:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.80095409422303,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.77161860426683,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:58:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:03:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2505.315351000263,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64624.35653950001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:44:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:38:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 116.81682827600282,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41730.368778000004,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:44:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:38:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.92226000033043,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:44:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41610.93636999999,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:38:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.642578608419566,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 100.42305277026975,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:44:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:38:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.85448758325757,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 102.35526501744602,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:44:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:38:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1873.8067115000376,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1959.06081000021,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.4497275599612,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.16835068669025,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.6132789999283,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.641754999593104,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.815606907763367,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.225090784758686,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.124262533905293,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.26305151808329,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 03:38:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18127.93551750019,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1855.636292500094,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:17:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 227.99471685199507,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.84331502001562,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:17:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 182.33619399961754,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60.36902550022205,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:17:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 149.28410064460172,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.833273623129934,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:17:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 153.7595891936329,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.143455619047463,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-01 04:17:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -90134,668 +90134,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-05-31T14:44:40Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/019467500eee9542833aedd6e2c17bd62ffa4b40"
         },
-        "date": 1717399193605,
+        "date": 1717399231380,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2055.8448110000427,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64884.99480949997,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:39:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.56332087997241,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42128.04296355933,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:39:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.486727499934204,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41348.1672575,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:39:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.734279113870905,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 101.04445164856457,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:39:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.832082933123225,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 103.1711670945071,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:39:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 255519.67455000023,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1872.9667494999376,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:18:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 241553.6490950207,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.80009647333888,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:18:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 242416.74192250002,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61.514852499840345,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:18:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.40638422233403,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.896918944598946,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:18:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.7109178209585,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.185281566078396,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:18:51 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6064.189869000074,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16066.287050499795,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:55:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.64453962000455,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 214.53785313599292,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:55:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.92926149986124,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 172.64887800001816,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:55:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.951733724818304,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 131.22571279814014,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:55:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.57946328211994,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 136.22333940196228,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:55:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3678.7117545,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2044.9386479999703,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:21:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 142.70120577671452,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.02566830661453,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:21:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 94.0109705006762,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.38501999982691,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:21:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.657908070816337,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.696571175341772,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:21:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.014463232733515,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.787949541710848,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:21:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18655.240721000155,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3678.8398609996875,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:05:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:21:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1362.2770440846543,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143.34025776999928,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:05:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:21:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 262.8703115005919,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.8414980005291,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:05:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:21:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.27636172303048,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.572533695436558,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:05:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:21:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 145.06888759254596,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.823086931210003,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:05:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:21:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2627.0912965001116,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19371.059033500387,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:06:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.82107498667513,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1885.296960885332,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:06:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 85.28235999983735,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 294.679543500024,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:06:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.810647724409144,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 159.6807658872163,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:06:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.80232702724121,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 147.29431613646037,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:06:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6131.326911000315,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78392.3951145,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:48:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:04:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 104.16482551332592,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58729.79366595998,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:48:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:04:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.90202300003511,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 71717.63896000016,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:48:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:04:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.78329706453252,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.47657554120445,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:48:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:04:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.25952119505826,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.3877575028016,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:48:13 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:04:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69814.66886199996,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17474.08793500017,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:40:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:11:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31225.528803002006,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 228.71194765666405,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:40:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:11:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30391.95524150091,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 183.9115324996783,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:40:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:11:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 191.78759748293524,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 145.1916789198398,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:40:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:11:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 202.26347417634994,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 151.33294772111572,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:40:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:11:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7891.291946000024,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6135.173119999763,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:30:15 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:49:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 156.65347058000043,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 105.64072813999398,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:30:15 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:49:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 132.4609829999872,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.32800649999626,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:30:15 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:49:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.602982839799644,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.79748169149696,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:30:15 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:49:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.20217650230741,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.28670736210095,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:30:15 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:49:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6055.366905000483,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5498.607529500077,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:58:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:30:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.88969452534124,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 182.54350937465642,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:58:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:30:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 88.752284000293,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 171.12448450006923,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:58:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:30:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54.74072664358886,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.94940404538018,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:58:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:30:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 47.74285928966727,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.06768571114641,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:58:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:30:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1955.5083455002205,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5138.209700498919,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:51:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 97.00293331333948,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.15705355996519,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:51:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 62.8722764995473,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.36117849939183,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:51:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.454777052584479,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.58822536454887,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:51:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.78043380833142,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.174410694362553,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:51:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15412.548571500338,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1954.9747514997762,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 210.16727801800093,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 96.62923728671558,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 163.89254700015954,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60.56203150001238,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 124.69744632406882,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.507764798042727,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 129.96581216853258,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.798509336531822,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16840.66650349996,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6139.169226500144,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:10:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:56:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 220.79072606800037,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 122.7274202033368,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:10:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:56:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 171.54757699972834,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.7162074999469,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:10:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:56:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 136.65848210703913,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.97776774062294,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:10:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:56:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.03363637500365,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.54337463571262,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:10:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:56:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2459.1634369999156,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2492.9172754996216,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 116.2581605826866,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 117.6331017546569,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.49792600008732,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.75818999961848,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.410164495557872,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.530413646906446,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.566398486916665,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.677904326030006,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5129.752017000101,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6414.464983000016,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:50:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.16210111339994,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 112.29079833333799,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:50:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.41876699945715,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 72.604216000002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:50:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.49815985398641,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.471566068882154,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:50:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.062567575039832,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.185656056709675,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:50:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5430.696962000184,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1966.2476574999346,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:30:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 182.9749076986797,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.28211399331971,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:30:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 173.23178100014047,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.71289049968618,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:30:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.03292704847647,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.294746986263503,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:30:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.00412580783481,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.34823025852795,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:30:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6086.761909499955,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6110.828845000469,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:23:26 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:58:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 127.48856736667524,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.98278304797957,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:23:26 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:58:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 87.618281999994,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.8971290002446,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:23:26 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:58:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.758815015615966,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55.213019199453505,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:23:26 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:58:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.57903657881813,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 48.04679258427369,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:23:26 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:58:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6387.866823999957,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 256072.86408100027,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:19:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 109.88967173333322,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 242264.19104754063,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:19:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.6485330000437,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 243567.29364700004,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:19:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.42783907498824,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 71.00752805688872,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:19:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.13229719876569,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.89142780360217,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:00 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:19:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78795.04042050007,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6088.1548394999645,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:03:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:24:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59064.32769362,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 128.75784957333812,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:03:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:24:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 72272.24899349995,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 86.68618349997814,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:03:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:24:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.32638058805797,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.65933069680194,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:03:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:24:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.77161860426683,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.60653004686339,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:03:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:24:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64624.35653950001,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2650.027239999872,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:38:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41730.368778000004,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.68155883867075,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:38:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41610.93636999999,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 85.72223100009069,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:38:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 100.42305277026975,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 20.00516919578365,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:38:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 102.35526501744602,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.887225188275885,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:38:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1959.06081000021,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 71232.04859200087,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:40:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.16835068669025,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 32204.601403142005,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:40:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.641754999593104,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31570.015363499806,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:40:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.225090784758686,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 192.88114407327933,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:40:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.26305151808329,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 204.25227204735702,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:40:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1855.636292500094,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7951.652094499991,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:31:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.84331502001562,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.5392373279974,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:31:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60.36902550022205,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 131.66923750009119,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:31:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.833273623129934,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.64349126188311,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:31:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.143455619047463,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.282586737329886,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:31:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -90816,2032 +90816,2032 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-05-31T14:44:40Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/019467500eee9542833aedd6e2c17bd62ffa4b40"
         },
-        "date": 1717399231380,
+        "date": 1717399352850,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64884.99480949997,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65653.67689900006,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:39:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:39:39 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42128.04296355933,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42922.725588200665,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:39:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:39:39 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41348.1672575,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42452.05514099996,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:39:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:39:39 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 101.04445164856457,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 100.64114655299583,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:39:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:39:39 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 103.1711670945071,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 103.02451028525023,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:39:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:39:39 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1872.9667494999376,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18028.672680499767,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:11:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.80009647333888,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 228.00184653931015,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:11:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 61.514852499840345,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 184.5591454998612,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:11:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.896918944598946,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 149.30636013642217,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:11:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.185281566078396,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 154.20055413725726,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:11:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16066.287050499795,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16051.454757500323,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 214.53785313599292,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 212.19625483396766,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 172.64887800001816,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 165.96760799984622,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 131.22571279814014,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 129.73121629907203,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 136.22333940196228,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 135.53129105893592,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2044.9386479999703,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1950.291978500445,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.02566830661453,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 96.71114405334265,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.38501999982691,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.57355600028313,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.696571175341772,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.472545235675167,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.787949541710848,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.795277081505917,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:49 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3678.8398609996875,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5484.436976999859,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:21:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:32:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.34025776999928,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 183.11239930000252,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:21:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:32:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.8414980005291,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 174.24708299949998,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:21:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:32:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.572533695436558,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.14093455176976,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:21:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:32:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.823086931210003,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.30365630337018,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:21:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:32:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19371.059033500387,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2635.7566330002555,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:06:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1885.296960885332,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 122.5081048280105,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:06:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 294.679543500024,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 86.22319499954756,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:06:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 159.6807658872163,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.82901910338906,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:06:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 147.29431613646037,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.959692317755575,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:06:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78392.3951145,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6082.61340899935,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:04:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:00:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58729.79366595998,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.66617888800829,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:04:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:00:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 71717.63896000016,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.32013650014414,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:04:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:00:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.47657554120445,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54.92637214879297,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:04:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:00:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.3877575028016,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 47.96357205752251,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:04:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:00:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17474.08793500017,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 259887.42452949987,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:11:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:19:39 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 228.71194765666405,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 244297.01181745736,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:11:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:19:39 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 183.9115324996783,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 246612.01731449977,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:11:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:19:39 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 145.1916789198398,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.85346358159815,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:11:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:19:39 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 151.33294772111572,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.03277572366092,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:11:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:19:39 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6135.173119999763,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2059.9316304997046,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:49:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 105.64072813999398,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.80629287329187,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:49:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.32800649999626,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.89324199983457,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:49:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.79748169149696,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.75254192367904,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:49:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.28670736210095,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.818274649825822,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:49:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5498.607529500077,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6147.821475499768,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:30:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:49:09 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 182.54350937465642,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 106.80471219998556,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:30:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:49:09 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 171.12448450006923,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.50420700013638,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:30:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:49:09 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.94940404538018,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.79804842406723,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:30:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:49:09 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.06768571114641,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.293424371762335,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:30:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:49:09 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5138.209700498919,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5145.081876000404,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:51:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:53:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.15705355996519,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 90.81483949998074,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:51:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:53:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.36117849939183,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.30737250068341,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:51:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:53:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.58822536454887,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.537024166646766,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:51:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:53:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.174410694362553,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.121061388982348,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:51:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:53:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1954.9747514997762,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2503.5247364999123,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 96.62923728671558,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 116.9130924946609,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60.56203150001238,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.03955550011233,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.507764798042727,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.600041352456742,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.798509336531822,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.867856167450782,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6139.169226500144,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1855.5622699996093,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:56:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 122.7274202033368,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.49166844330588,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:56:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.7162074999469,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60.94448049998391,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:56:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.97776774062294,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.805301836104457,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:56:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.54337463571262,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.068215507272877,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:56:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2492.9172754996216,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19543.508186999134,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:08:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 117.6331017546569,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1963.6742602433683,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:08:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.75818999961848,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 309.27946499923564,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:08:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.530413646906446,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 159.62565076317185,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:08:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.677904326030006,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 147.1995780301561,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:08:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6414.464983000016,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3680.0972384999113,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:24:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 112.29079833333799,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143.19777263336314,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:24:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 72.604216000002,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.91298149926297,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:24:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.471566068882154,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.69263322334179,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:24:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.185656056709675,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.913538028351816,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:24:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1966.2476574999346,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6405.955829499988,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.28211399331971,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 111.1068306533328,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.71289049968618,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.63821700006656,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.294746986263503,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.43517654688617,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.34823025852795,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.149975453829946,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6110.828845000469,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1969.1396335001627,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:58:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.98278304797957,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.93283122000018,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:58:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.8971290002446,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.59165750009197,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:58:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55.213019199453505,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.23605530753715,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:58:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 48.04679258427369,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.304046444367396,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:58:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 256072.86408100027,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70070.83320249966,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:19:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:42:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 242264.19104754063,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31352.185162739992,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:19:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:42:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 243567.29364700004,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30375.764695000726,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:19:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:42:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 71.00752805688872,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 193.05356530653393,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:19:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:42:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.89142780360217,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 203.46995719542804,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:19:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:42:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6088.1548394999645,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6088.025100000095,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:24:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:56:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 128.75784957333812,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 123.1539829,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:24:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:56:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 86.68618349997814,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.05334449990187,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:24:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:56:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.65933069680194,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.04965201487477,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:24:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:56:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.60653004686339,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.64856071417043,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:24:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:56:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2650.027239999872,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7964.07807549997,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:31:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.68155883867075,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.12438900266352,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:31:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 85.72223100009069,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 132.33693000006497,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:31:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 20.00516919578365,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.697434615094835,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:31:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.887225188275885,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.35603009244823,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:31:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 71232.04859200087,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6083.727579999959,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:40:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:24:15 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 32204.601403142005,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 127.27163534000054,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:40:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:24:15 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31570.015363499806,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 85.52642900008323,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:40:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:24:15 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 192.88114407327933,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.777392715063506,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:40:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:24:15 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 204.25227204735702,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.60346729963552,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:40:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:24:15 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7951.652094499991,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78119.36454800003,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:31:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:04:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.5392373279974,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58545.294935608,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:31:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:04:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 131.66923750009119,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 71478.12105899994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:31:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:04:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.64349126188311,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.41038133516298,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:31:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:04:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.282586737329886,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.31888471951564,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:31:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:04:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
       {
         "commit": {
           "author": {
-            "name": "Domenic Barbuzzi",
-            "username": "dbarbuzzi",
-            "email": "dbarbuzzi@gmail.com"
+            "name": "Robert Shaw",
+            "username": "robertgshaw2-neuralmagic",
+            "email": "114415538+robertgshaw2-neuralmagic@users.noreply.github.com"
           },
           "committer": {
             "name": "GitHub",
             "username": "web-flow",
             "email": "noreply@github.com"
           },
-          "id": "019467500eee9542833aedd6e2c17bd62ffa4b40",
-          "message": "Handle server startup failure in __enter__ (#274)\n\nWith a context manager class, the `__exit__` method is not called when\r\nan exception is raised during the context manager’s `__enter__` method.\r\nThis PR addresses that by manually calling that method if an exception\r\nis raised.",
-          "timestamp": "2024-05-31T14:44:40Z",
-          "url": "https://github.com/neuralmagic/nm-vllm/commit/019467500eee9542833aedd6e2c17bd62ffa4b40"
+          "id": "fec35636296a2dfad0880699e061918f64a3b9d6",
+          "message": "Upstream sync 2024 05 19 (#249)\n\nUpstream sync 2024 05 25 (#249)\r\n\r\nSUMMARY:\r\nMerge commits from\r\nhttps://github.com/vllm-project/vllm/commit/c7f2cf2b7f67bce5842fedfdba508440fe257375\r\nto\r\nhttps://github.com/vllm-project/vllm/commit/f68470e803df575f294e67167b4b83adfe004cfa\r\n\r\nNote that\r\nhttps://github.com/vllm-project/vllm/commit/c7f2cf2b7f67bce5842fedfdba508440fe257375\r\nis NOT included in this merge.\r\n\r\n---\r\n\r\n<details>\r\n<!-- inside this <details> section, markdown rendering does not work, so\r\nwe use raw html here. -->\r\n<summary><b> PR Checklist (Click to Expand) </b></summary>\r\n\r\n<p>Thank you for your contribution to vLLM! Before submitting the pull\r\nrequest, please ensure the PR meets the following criteria. This helps\r\nvLLM maintain the code quality and improve the efficiency of the review\r\nprocess.</p>\r\n\r\n<h3>PR Title and Classification</h3>\r\n<p>Only specific types of PRs will be reviewed. The PR title is prefixed\r\nappropriately to indicate the type of change. Please use one of the\r\nfollowing:</p>\r\n<ul>\r\n    <li><code>[Bugfix]</code> for bug fixes.</li>\r\n<li><code>[CI/Build]</code> for build or continuous integration\r\nimprovements.</li>\r\n<li><code>[Doc]</code> for documentation fixes and improvements.</li>\r\n<li><code>[Model]</code> for adding a new model or improving an existing\r\nmodel. Model name should appear in the title.</li>\r\n<li><code>[Frontend]</code> For changes on the vLLM frontend (e.g.,\r\nOpenAI API server, <code>LLM</code> class, etc.) </li>\r\n<li><code>[Kernel]</code> for changes affecting CUDA kernels or other\r\ncompute kernels.</li>\r\n<li><code>[Core]</code> for changes in the core vLLM logic (e.g.,\r\n<code>LLMEngine</code>, <code>AsyncLLMEngine</code>,\r\n<code>Scheduler</code>, etc.)</li>\r\n<li><code>[Hardware][Vendor]</code> for hardware-specific changes.\r\nVendor name should appear in the prefix (e.g.,\r\n<code>[Hardware][AMD]</code>).</li>\r\n<li><code>[Misc]</code> for PRs that do not fit the above categories.\r\nPlease use this sparingly.</li>\r\n</ul>\r\n<p><strong>Note:</strong> If the PR spans more than one category, please\r\ninclude all relevant prefixes.</p>\r\n\r\n<h3>Code Quality</h3>\r\n\r\n<p>The PR need to meet the following code quality standards:</p>\r\n\r\n<ul>\r\n<li>We adhere to <a\r\nhref=\"https://google.github.io/styleguide/pyguide.html\">Google Python\r\nstyle guide</a> and <a\r\nhref=\"https://google.github.io/styleguide/cppguide.html\">Google C++\r\nstyle guide</a>.</li>\r\n<li>Pass all linter checks. Please use <a\r\nhref=\"https://github.com/vllm-project/vllm/blob/main/format.sh\"><code>format.sh</code></a>\r\nto format your code.</li>\r\n<li>The code need to be well-documented to ensure future contributors\r\ncan easily understand the code.</li>\r\n<li>Include sufficient tests to ensure the project to stay correct and\r\nrobust. This includes both unit tests and integration tests.</li>\r\n<li>Please add documentation to <code>docs/source/</code> if the PR\r\nmodifies the user-facing behaviors of vLLM. It helps vLLM user\r\nunderstand and utilize the new features or changes.</li>\r\n</ul>\r\n\r\n<h3>Notes for Large Changes</h3>\r\n<p>Please keep the changes as concise as possible. For major\r\narchitectural changes (>500 LOC excluding kernel/data/config/test), we\r\nwould expect a GitHub issue (RFC) discussing the technical design and\r\njustification. Otherwise, we will tag it with <code>rfc-required</code>\r\nand might not go through the PR.</p>\r\n\r\n<h3>What to Expect for the Reviews</h3>\r\n\r\n<p>The goal of the vLLM team is to be a <i>transparent reviewing\r\nmachine</i>. We would like to make the review process transparent and\r\nefficient and make sure no contributor feel confused or frustrated.\r\nHowever, the vLLM team is small, so we need to prioritize some PRs over\r\nothers. Here is what you can expect from the review process: </p>\r\n\r\n<ul>\r\n<li> After the PR is submitted, the PR will be assigned to a reviewer.\r\nEvery reviewer will pick up the PRs based on their expertise and\r\navailability.</li>\r\n<li> After the PR is assigned, the reviewer will provide status update\r\nevery 2-3 days. If the PR is not reviewed within 7 days, please feel\r\nfree to ping the reviewer or the vLLM team.</li>\r\n<li> After the review, the reviewer will put an <code>\r\naction-required</code> label on the PR if there are changes required.\r\nThe contributor should address the comments and ping the reviewer to\r\nre-review the PR.</li>\r\n<li> Please respond to all comments within a reasonable time frame. If a\r\ncomment isn't clear or you disagree with a suggestion, feel free to ask\r\nfor clarification or discuss the suggestion.\r\n </li>\r\n</ul>\r\n\r\n<h3>Thank You</h3>\r\n\r\n<p> Finally, thank you for taking the time to read these guidelines and\r\nfor your interest in contributing to vLLM. Your contributions make vLLM\r\na great tool for everyone! </p>\r\n\r\n\r\n</details>\r\n\r\n---------\r\n\r\nSigned-off-by: kerthcet <kerthcet@gmail.com>\r\nCo-authored-by: zhaoyang-star <zhaoyangstar@foxmail.com>\r\nCo-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>\r\nCo-authored-by: Simon Mo <simon.mo@hey.com>\r\nCo-authored-by: Cade Daniel <edacih@gmail.com>\r\nCo-authored-by: Noam Gat <noamgat@gmail.com>\r\nCo-authored-by: Philipp Moritz <pcmoritz@gmail.com>\r\nCo-authored-by: youkaichao <youkaichao@gmail.com>\r\nCo-authored-by: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com>\r\nCo-authored-by: Austin Veselka <50646302+FurtherAI@users.noreply.github.com>\r\nCo-authored-by: leiwen83 <leiwen83@users.noreply.github.com>\r\nCo-authored-by: Lei Wen <wenlei03@qiyi.com>\r\nCo-authored-by: Cody Yu <hao.yu.cody@gmail.com>\r\nCo-authored-by: SangBin Cho <rkooo567@gmail.com>\r\nCo-authored-by: DefTruth <31974251+DefTruth@users.noreply.github.com>\r\nCo-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>\r\nCo-authored-by: Antoni Baum <antoni.baum@protonmail.com>\r\nCo-authored-by: alexm-nm <59768536+alexm-nm@users.noreply.github.com>\r\nCo-authored-by: Mahmoud Ashraf <hassouna97.ma@gmail.com>\r\nCo-authored-by: Michael Goin <michael@neuralmagic.com>\r\nCo-authored-by: kliuae <17350011+kliuae@users.noreply.github.com>\r\nCo-authored-by: miloice <jeffaw99@hotmail.com>\r\nCo-authored-by: Hao Zhang <152229491+sfc-gh-hazhang@users.noreply.github.com>\r\nCo-authored-by: Dash Desai <1723932+iamontheinet@users.noreply.github.com>\r\nCo-authored-by: Aurick Qiao <qiao@aurick.net>\r\nCo-authored-by: Aurick Qiao <aurick.qiao@snowflake.com>\r\nCo-authored-by: Aurick Qiao <aurickq@users.noreply.github.com>\r\nCo-authored-by: Allen.Dou <allen.dou@hotmail.com>\r\nCo-authored-by: Kunshang Ji <kunshang.ji@intel.com>\r\nCo-authored-by: Steve Grubb <ausearch.1@gmail.com>\r\nCo-authored-by: heeju-kim2 <157340754+heeju-kim2@users.noreply.github.com>\r\nCo-authored-by: Chang Su <chang.s.su@oracle.com>\r\nCo-authored-by: Yikang Shen <yikang.shn@gmail.com>\r\nCo-authored-by: Swapnil Parekh <swapnilbp100@gmail.com>\r\nCo-authored-by: Sanger Steel <sangersteel@gmail.com>\r\nCo-authored-by: Stephen Krider <72541272+skrider@users.noreply.github.com>\r\nCo-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>\r\nCo-authored-by: Zhuohan Li <zhuohan123@gmail.com>\r\nCo-authored-by: Kuntai Du <kuntai@uchicago.edu>\r\nCo-authored-by: Nick Hill <nickhill@us.ibm.com>\r\nCo-authored-by: SAHIL SUNEJA <suneja@us.ibm.com>\r\nCo-authored-by: zifeitong <zifeitong@gmail.com>\r\nCo-authored-by: Alex Wu <itswu.alex@gmail.com>\r\nCo-authored-by: Cade Daniel <cade@anyscale.com>\r\nCo-authored-by: Jinzhen Lin <linjinzhen@hotmail.com>\r\nCo-authored-by: Alex Wu <alexanderwu@berkeley.edu>\r\nCo-authored-by: Pierre Dulac <dulacpier@gmail.com>\r\nCo-authored-by: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com>\r\nCo-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>\r\nCo-authored-by: Silencio <19430328+Silencioo@users.noreply.github.com>\r\nCo-authored-by: Silencio <silencio@adsl-99-6-187-6.dsl.irvnca.sbcglobal.net>\r\nCo-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>\r\nCo-authored-by: Kante Yin <kerthcet@gmail.com>\r\nCo-authored-by: bofeng huang <bofenghuang7@gmail.com>\r\nCo-authored-by: eigenLiu <33959526+eigen2017@users.noreply.github.com>\r\nCo-authored-by: alexeykondrat <143633163+alexeykondrat@users.noreply.github.com>\r\nCo-authored-by: Ubuntu <ubuntu@ip-10-10-40-24.ec2.internal>\r\nCo-authored-by: Domenic Barbuzzi <domenic@neuralmagic.com>",
+          "timestamp": "2024-06-03T15:22:54Z",
+          "url": "https://github.com/neuralmagic/nm-vllm/commit/fec35636296a2dfad0880699e061918f64a3b9d6"
         },
-        "date": 1717399352850,
+        "date": 1717436914272,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65653.67689900006,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1826.7876810000416,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:39:39 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:03:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42922.725588200665,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.30180574333039,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:39:39 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:03:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42452.05514099996,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.09178000006432,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:39:39 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:03:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 100.64114655299583,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.494657948635467,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:39:39 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:03:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 103.02451028525023,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.822835040136072,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:39:39 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:03:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18028.672680499767,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2577.4123235000843,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:11:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:35:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 228.00184653931015,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.01311407067139,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:11:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:35:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 184.5591454998612,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.47934300036286,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:11:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:35:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 149.30636013642217,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.917851642837732,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:11:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:35:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 154.20055413725726,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.005374696378382,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:11:08 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:35:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16051.454757500323,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7477.170930999932,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:03:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 212.19625483396766,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 155.60422404133442,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:03:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 165.96760799984622,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 126.69303699999546,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:03:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 129.73121629907203,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56.154255656328566,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:03:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 135.53129105893592,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55.70570047176915,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:45:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:03:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1950.291978500445,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5792.872616500063,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:30:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 96.71114405334265,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.60485665071124,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:30:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.57355600028313,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 88.6595430001762,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:30:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.472545235675167,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 52.57606963795908,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:30:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.795277081505917,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 45.77184932528473,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:58:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:30:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5484.436976999859,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6359.727259500005,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:32:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 12:49:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 183.11239930000252,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 111.62061250000306,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:32:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 12:49:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 174.24708299949998,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 71.44836100002294,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:32:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 12:49:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.14093455176976,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.14834417896245,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:32:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 12:49:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.30365630337018,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.866016742486394,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:32:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 12:49:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2635.7566330002555,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2027.7279659999294,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:23:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 122.5081048280105,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.131375346716,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:23:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 86.22319499954756,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.80655000011757,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:23:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.82901910338906,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.574741119740576,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:23:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.959692317755575,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.62783674047078,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 04:04:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:23:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6082.61340899935,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5086.0681250005655,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:00:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:23:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.66617888800829,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 90.60802515331791,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:00:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:23:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.32013650014414,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 63.84449399956793,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:00:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:23:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54.92637214879297,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.191637028120056,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:00:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:23:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 47.96357205752251,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30.83664939242367,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:00:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:23:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 259887.42452949987,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13438.462082000115,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:19:39 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:41:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 244297.01181745736,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 207.69161295667195,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:19:39 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:41:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 246612.01731449977,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 162.68896749988926,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:19:39 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:41:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.85346358159815,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 104.1323195408694,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:19:39 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:41:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.03277572366092,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 107.03430961238783,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:19:39 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:41:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2059.9316304997046,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2410.390771999573,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:09:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.80629287329187,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 116.01512438667001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:09:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.89324199983457,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.54676449955878,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:09:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.75254192367904,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.631069397802065,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:09:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.818274649825822,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.065749860137487,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:51:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:09:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6147.821475499768,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58115.24761200053,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:49:09 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:12:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 106.80471219998556,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24693.190178945297,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:49:09 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:12:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.50420700013638,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 22696.127297999737,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:49:09 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:12:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.79804842406723,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 182.07596480055477,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:49:09 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:12:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.293424371762335,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 192.91163983201352,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:49:09 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:12:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5145.081876000404,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3638.090063499476,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:53:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 14:54:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 90.81483949998074,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 141.89584527663706,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:53:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 14:54:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.30737250068341,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.82084300048155,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:53:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 14:54:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.537024166646766,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.30408247226253,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:53:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 14:54:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.121061388982348,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.61374608368908,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:53:20 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 14:54:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2503.5247364999123,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1942.1043275001466,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:57:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 116.9130924946609,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.1003962000262,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:57:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.03955550011233,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.76818800017645,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:57:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.600041352456742,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.053056741675453,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:57:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.867856167450782,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.178593173806847,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:38:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:57:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1855.5622699996093,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1919.03614299963,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:29:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.49166844330588,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.22951738337119,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:29:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60.94448049998391,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.753891499440215,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:29:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.805301836104457,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.171030743815644,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:29:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.068215507272877,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.444907479944426,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:32:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:29:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19543.508186999134,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6011.056845000212,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:08:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:27:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1963.6742602433683,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.51665396332828,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:08:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:27:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 309.27946499923564,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 76.40881049997006,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:08:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:27:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 159.62565076317185,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.447607174651544,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:08:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:27:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 147.1995780301561,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.04213773936851,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 05:08:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:27:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3680.0972384999113,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6010.300804999929,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:24:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 12:56:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.19777263336314,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 126.94023836000534,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:24:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 12:56:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.91298149926297,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 86.4834220000148,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:24:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 12:56:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.69263322334179,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.24737687956218,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:24:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 12:56:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.913538028351816,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.325393796500656,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:24:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 12:56:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6405.955829499988,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5231.110777500362,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:02:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 111.1068306533328,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 180.19965554800243,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:02:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.63821700006656,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 169.80007999973168,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:02:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.43517654688617,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.8138107661964,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:02:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.149975453829946,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.10993344656062,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:17:47 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:02:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1969.1396335001627,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6102.600778999886,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:20:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.93283122000018,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 105.56953884001207,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:20:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.59165750009197,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.45767050031282,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:20:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.23605530753715,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.50484903749174,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:20:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.304046444367396,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.01785497536404,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:26:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:20:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70070.83320249966,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13024.152943000445,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:42:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:37:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31352.185162739992,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 166.09906876733658,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:42:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:37:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30375.764695000726,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 130.501374999767,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:42:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:37:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 193.05356530653393,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 116.45557543230827,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:42:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:37:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 203.46995719542804,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 104.93449016074172,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 04:42:04 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:37:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6088.025100000095,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 73612.5902235001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:56:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:36:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 123.1539829,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54508.354054376,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:56:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:36:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.05334449990187,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66708.06425700017,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:56:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:36:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.04965201487477,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.84795469274688,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:56:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:36:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.64856071417043,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.49598703678805,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:56:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:36:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7964.07807549997,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57355.77882799987,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:31:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:11:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.12438900266352,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35571.797603253995,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:31:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:11:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 132.33693000006497,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35350.50049350002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:31:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:11:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.697434615094835,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 96.58072383981926,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:31:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:11:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.35603009244823,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 98.55851647084337,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:31:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:11:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6083.727579999959,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 249174.2924559994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:24:15 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:50:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 127.27163534000054,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 235009.30902119138,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:24:15 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:50:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 85.52642900008323,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 236050.11426999955,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:24:15 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:50:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.777392715063506,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.79708311294138,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:24:15 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:50:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.60346729963552,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.43778767154,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 02:24:15 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:50:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78119.36454800003,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11550.040795000314,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:04:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:16:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58545.294935608,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 191.73528062333148,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:04:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:16:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 71478.12105899994,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 148.74351900016336,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:04:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:16:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.41038133516298,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 87.77569215848233,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:04:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:16:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.31888471951564,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.93932269039736,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 03:04:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:16:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
       {
         "commit": {
           "author": {
-            "name": "Robert Shaw",
-            "username": "robertgshaw2-neuralmagic",
-            "email": "114415538+robertgshaw2-neuralmagic@users.noreply.github.com"
+            "name": "dhuangnm",
+            "username": "dhuangnm",
+            "email": "74931910+dhuangnm@users.noreply.github.com"
           },
           "committer": {
             "name": "GitHub",
             "username": "web-flow",
             "email": "noreply@github.com"
           },
-          "id": "fec35636296a2dfad0880699e061918f64a3b9d6",
-          "message": "Upstream sync 2024 05 19 (#249)\n\nUpstream sync 2024 05 25 (#249)\r\n\r\nSUMMARY:\r\nMerge commits from\r\nhttps://github.com/vllm-project/vllm/commit/c7f2cf2b7f67bce5842fedfdba508440fe257375\r\nto\r\nhttps://github.com/vllm-project/vllm/commit/f68470e803df575f294e67167b4b83adfe004cfa\r\n\r\nNote that\r\nhttps://github.com/vllm-project/vllm/commit/c7f2cf2b7f67bce5842fedfdba508440fe257375\r\nis NOT included in this merge.\r\n\r\n---\r\n\r\n<details>\r\n<!-- inside this <details> section, markdown rendering does not work, so\r\nwe use raw html here. -->\r\n<summary><b> PR Checklist (Click to Expand) </b></summary>\r\n\r\n<p>Thank you for your contribution to vLLM! Before submitting the pull\r\nrequest, please ensure the PR meets the following criteria. This helps\r\nvLLM maintain the code quality and improve the efficiency of the review\r\nprocess.</p>\r\n\r\n<h3>PR Title and Classification</h3>\r\n<p>Only specific types of PRs will be reviewed. The PR title is prefixed\r\nappropriately to indicate the type of change. Please use one of the\r\nfollowing:</p>\r\n<ul>\r\n    <li><code>[Bugfix]</code> for bug fixes.</li>\r\n<li><code>[CI/Build]</code> for build or continuous integration\r\nimprovements.</li>\r\n<li><code>[Doc]</code> for documentation fixes and improvements.</li>\r\n<li><code>[Model]</code> for adding a new model or improving an existing\r\nmodel. Model name should appear in the title.</li>\r\n<li><code>[Frontend]</code> For changes on the vLLM frontend (e.g.,\r\nOpenAI API server, <code>LLM</code> class, etc.) </li>\r\n<li><code>[Kernel]</code> for changes affecting CUDA kernels or other\r\ncompute kernels.</li>\r\n<li><code>[Core]</code> for changes in the core vLLM logic (e.g.,\r\n<code>LLMEngine</code>, <code>AsyncLLMEngine</code>,\r\n<code>Scheduler</code>, etc.)</li>\r\n<li><code>[Hardware][Vendor]</code> for hardware-specific changes.\r\nVendor name should appear in the prefix (e.g.,\r\n<code>[Hardware][AMD]</code>).</li>\r\n<li><code>[Misc]</code> for PRs that do not fit the above categories.\r\nPlease use this sparingly.</li>\r\n</ul>\r\n<p><strong>Note:</strong> If the PR spans more than one category, please\r\ninclude all relevant prefixes.</p>\r\n\r\n<h3>Code Quality</h3>\r\n\r\n<p>The PR need to meet the following code quality standards:</p>\r\n\r\n<ul>\r\n<li>We adhere to <a\r\nhref=\"https://google.github.io/styleguide/pyguide.html\">Google Python\r\nstyle guide</a> and <a\r\nhref=\"https://google.github.io/styleguide/cppguide.html\">Google C++\r\nstyle guide</a>.</li>\r\n<li>Pass all linter checks. Please use <a\r\nhref=\"https://github.com/vllm-project/vllm/blob/main/format.sh\"><code>format.sh</code></a>\r\nto format your code.</li>\r\n<li>The code need to be well-documented to ensure future contributors\r\ncan easily understand the code.</li>\r\n<li>Include sufficient tests to ensure the project to stay correct and\r\nrobust. This includes both unit tests and integration tests.</li>\r\n<li>Please add documentation to <code>docs/source/</code> if the PR\r\nmodifies the user-facing behaviors of vLLM. It helps vLLM user\r\nunderstand and utilize the new features or changes.</li>\r\n</ul>\r\n\r\n<h3>Notes for Large Changes</h3>\r\n<p>Please keep the changes as concise as possible. For major\r\narchitectural changes (>500 LOC excluding kernel/data/config/test), we\r\nwould expect a GitHub issue (RFC) discussing the technical design and\r\njustification. Otherwise, we will tag it with <code>rfc-required</code>\r\nand might not go through the PR.</p>\r\n\r\n<h3>What to Expect for the Reviews</h3>\r\n\r\n<p>The goal of the vLLM team is to be a <i>transparent reviewing\r\nmachine</i>. We would like to make the review process transparent and\r\nefficient and make sure no contributor feel confused or frustrated.\r\nHowever, the vLLM team is small, so we need to prioritize some PRs over\r\nothers. Here is what you can expect from the review process: </p>\r\n\r\n<ul>\r\n<li> After the PR is submitted, the PR will be assigned to a reviewer.\r\nEvery reviewer will pick up the PRs based on their expertise and\r\navailability.</li>\r\n<li> After the PR is assigned, the reviewer will provide status update\r\nevery 2-3 days. If the PR is not reviewed within 7 days, please feel\r\nfree to ping the reviewer or the vLLM team.</li>\r\n<li> After the review, the reviewer will put an <code>\r\naction-required</code> label on the PR if there are changes required.\r\nThe contributor should address the comments and ping the reviewer to\r\nre-review the PR.</li>\r\n<li> Please respond to all comments within a reasonable time frame. If a\r\ncomment isn't clear or you disagree with a suggestion, feel free to ask\r\nfor clarification or discuss the suggestion.\r\n </li>\r\n</ul>\r\n\r\n<h3>Thank You</h3>\r\n\r\n<p> Finally, thank you for taking the time to read these guidelines and\r\nfor your interest in contributing to vLLM. Your contributions make vLLM\r\na great tool for everyone! </p>\r\n\r\n\r\n</details>\r\n\r\n---------\r\n\r\nSigned-off-by: kerthcet <kerthcet@gmail.com>\r\nCo-authored-by: zhaoyang-star <zhaoyangstar@foxmail.com>\r\nCo-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>\r\nCo-authored-by: Simon Mo <simon.mo@hey.com>\r\nCo-authored-by: Cade Daniel <edacih@gmail.com>\r\nCo-authored-by: Noam Gat <noamgat@gmail.com>\r\nCo-authored-by: Philipp Moritz <pcmoritz@gmail.com>\r\nCo-authored-by: youkaichao <youkaichao@gmail.com>\r\nCo-authored-by: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com>\r\nCo-authored-by: Austin Veselka <50646302+FurtherAI@users.noreply.github.com>\r\nCo-authored-by: leiwen83 <leiwen83@users.noreply.github.com>\r\nCo-authored-by: Lei Wen <wenlei03@qiyi.com>\r\nCo-authored-by: Cody Yu <hao.yu.cody@gmail.com>\r\nCo-authored-by: SangBin Cho <rkooo567@gmail.com>\r\nCo-authored-by: DefTruth <31974251+DefTruth@users.noreply.github.com>\r\nCo-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>\r\nCo-authored-by: Antoni Baum <antoni.baum@protonmail.com>\r\nCo-authored-by: alexm-nm <59768536+alexm-nm@users.noreply.github.com>\r\nCo-authored-by: Mahmoud Ashraf <hassouna97.ma@gmail.com>\r\nCo-authored-by: Michael Goin <michael@neuralmagic.com>\r\nCo-authored-by: kliuae <17350011+kliuae@users.noreply.github.com>\r\nCo-authored-by: miloice <jeffaw99@hotmail.com>\r\nCo-authored-by: Hao Zhang <152229491+sfc-gh-hazhang@users.noreply.github.com>\r\nCo-authored-by: Dash Desai <1723932+iamontheinet@users.noreply.github.com>\r\nCo-authored-by: Aurick Qiao <qiao@aurick.net>\r\nCo-authored-by: Aurick Qiao <aurick.qiao@snowflake.com>\r\nCo-authored-by: Aurick Qiao <aurickq@users.noreply.github.com>\r\nCo-authored-by: Allen.Dou <allen.dou@hotmail.com>\r\nCo-authored-by: Kunshang Ji <kunshang.ji@intel.com>\r\nCo-authored-by: Steve Grubb <ausearch.1@gmail.com>\r\nCo-authored-by: heeju-kim2 <157340754+heeju-kim2@users.noreply.github.com>\r\nCo-authored-by: Chang Su <chang.s.su@oracle.com>\r\nCo-authored-by: Yikang Shen <yikang.shn@gmail.com>\r\nCo-authored-by: Swapnil Parekh <swapnilbp100@gmail.com>\r\nCo-authored-by: Sanger Steel <sangersteel@gmail.com>\r\nCo-authored-by: Stephen Krider <72541272+skrider@users.noreply.github.com>\r\nCo-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>\r\nCo-authored-by: Zhuohan Li <zhuohan123@gmail.com>\r\nCo-authored-by: Kuntai Du <kuntai@uchicago.edu>\r\nCo-authored-by: Nick Hill <nickhill@us.ibm.com>\r\nCo-authored-by: SAHIL SUNEJA <suneja@us.ibm.com>\r\nCo-authored-by: zifeitong <zifeitong@gmail.com>\r\nCo-authored-by: Alex Wu <itswu.alex@gmail.com>\r\nCo-authored-by: Cade Daniel <cade@anyscale.com>\r\nCo-authored-by: Jinzhen Lin <linjinzhen@hotmail.com>\r\nCo-authored-by: Alex Wu <alexanderwu@berkeley.edu>\r\nCo-authored-by: Pierre Dulac <dulacpier@gmail.com>\r\nCo-authored-by: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com>\r\nCo-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>\r\nCo-authored-by: Silencio <19430328+Silencioo@users.noreply.github.com>\r\nCo-authored-by: Silencio <silencio@adsl-99-6-187-6.dsl.irvnca.sbcglobal.net>\r\nCo-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>\r\nCo-authored-by: Kante Yin <kerthcet@gmail.com>\r\nCo-authored-by: bofeng huang <bofenghuang7@gmail.com>\r\nCo-authored-by: eigenLiu <33959526+eigen2017@users.noreply.github.com>\r\nCo-authored-by: alexeykondrat <143633163+alexeykondrat@users.noreply.github.com>\r\nCo-authored-by: Ubuntu <ubuntu@ip-10-10-40-24.ec2.internal>\r\nCo-authored-by: Domenic Barbuzzi <domenic@neuralmagic.com>",
-          "timestamp": "2024-06-03T15:22:54Z",
-          "url": "https://github.com/neuralmagic/nm-vllm/commit/fec35636296a2dfad0880699e061918f64a3b9d6"
+          "id": "fdd9a3143946895e4cec17ba9db28937d48cdfd9",
+          "message": "add latest tag for release docker image (#279)\n\nNeed to tag the latest for the release docker image\r\n\r\n---------\r\n\r\nCo-authored-by: dhuangnm <dhuang@MacBook-Pro-2.local>",
+          "timestamp": "2024-06-03T20:14:54Z",
+          "url": "https://github.com/neuralmagic/nm-vllm/commit/fdd9a3143946895e4cec17ba9db28937d48cdfd9"
         },
-        "date": 1717436914272,
+        "date": 1717486794145,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1826.7876810000416,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1968.3447554998565,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:03:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:19:54 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.30180574333039,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 94.80455332328651,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:03:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:19:54 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.09178000006432,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.64442700046857,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:03:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:19:54 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.494657948635467,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.541103052491863,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:03:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:19:54 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.822835040136072,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.871362886241304,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:03:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:19:54 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2577.4123235000843,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6106.36480050016,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:35:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:18:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.01311407067139,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.76388846999937,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:35:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:18:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.47934300036286,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 77.77009599999474,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:35:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:18:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.917851642837732,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.12390406482,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:35:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:18:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.005374696378382,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.70404936507413,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:35:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:18:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7477.170930999932,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2072.402735500418,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:03:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 155.60422404133442,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.25535124000089,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:03:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 126.69303699999546,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.30683899984433,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:03:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56.154255656328566,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.805719611744376,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:03:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55.70570047176915,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.947181879927312,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:03:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5792.872616500063,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3709.8409445006837,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:30:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:45:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.60485665071124,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143.5810374200385,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:30:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:45:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 88.6595430001762,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.83627799979877,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:30:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:45:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 52.57606963795908,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.84327153864837,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:30:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:45:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 45.77184932528473,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.100691502900332,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:30:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:45:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6359.727259500005,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15567.694932500217,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 12:49:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:07:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 111.62061250000306,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 211.95435407732356,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 12:49:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:07:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 71.44836100002294,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 170.61118799983888,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 12:49:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:07:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.14834417896245,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 125.28643066526487,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 12:49:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:07:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.866016742486394,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 130.88409230534404,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 12:49:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:07:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2027.7279659999294,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1885.0982009998916,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:23:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:54:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.131375346716,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.19699156666744,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:23:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:54:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.80655000011757,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.964666000065336,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:23:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:54:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.574741119740576,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.972681839311665,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:23:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:54:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.62783674047078,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.240989983625319,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:23:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:54:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5086.0681250005655,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1985.1058935000765,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:23:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:48:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 90.60802515331791,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78.90430167327699,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:23:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:48:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 63.84449399956793,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.71939650020795,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:23:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:48:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.191637028120056,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.379385445370405,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:23:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:48:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30.83664939242367,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.469891548717342,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:23:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:48:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13438.462082000115,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67769.83289000054,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:41:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:03:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 207.69161295667195,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 29962.627152307323,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:41:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:03:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 162.68896749988926,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 28717.597102499894,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:41:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:03:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 104.1323195408694,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 190.42693975378825,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:41:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:03:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 107.03430961238783,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 201.70061792719784,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:41:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:03:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2410.390771999573,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 62892.62237150001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:09:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:02:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 116.01512438667001,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40386.830530422,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:09:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:02:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.54676449955878,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40184.519388000124,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:09:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:02:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.631069397802065,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 99.63145586858826,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:09:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:02:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.065749860137487,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 101.68223204445646,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:09:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:02:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58115.24761200053,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2502.4847879994923,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:12:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:00:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24693.190178945297,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 115.04334467331257,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:12:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:00:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 22696.127297999737,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.55580650034244,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:12:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:00:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 182.07596480055477,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.610256669780377,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:12:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:00:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 192.91163983201352,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.822594410864983,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:12:08 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:00:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3638.090063499476,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5157.87423349866,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 14:54:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 141.89584527663706,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 90.3476442533065,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 14:54:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.82084300048155,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 63.08617849936127,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 14:54:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.30408247226253,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.67354362422549,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 14:54:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.61374608368908,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.275710766273264,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 14:54:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1942.1043275001466,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 257234.431072,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:57:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:42:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.1003962000262,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 243333.00650803535,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:57:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:42:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.76818800017645,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 243501.93066449993,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:57:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:42:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.053056741675453,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.05580158040362,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:57:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:42:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.178593173806847,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.11879649065315,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:57:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:42:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1919.03614299963,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5429.674158000125,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:29:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:53:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.22951738337119,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 183.65393957596098,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:29:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:53:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.753891499440215,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 172.55695099993318,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:29:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:53:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.171030743815644,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.96852669778389,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:29:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:53:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.444907479944426,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.996983684938925,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:29:15 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:53:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6011.056845000212,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2651.403954000216,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:27:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:25:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.51665396332828,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.47014652266928,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:27:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:25:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 76.40881049997006,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.83390100036559,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:27:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:25:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.447607174651544,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.836291896778306,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:27:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:25:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.04213773936851,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.805192921584684,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:27:53 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:25:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6010.300804999929,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6169.347012999879,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 12:56:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:11:50 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 126.94023836000534,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 104.59407726000487,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 12:56:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:11:50 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 86.4834220000148,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.40293249979368,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 12:56:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:11:50 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.24737687956218,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.93313201739429,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 12:56:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:11:50 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.325393796500656,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.38531133968375,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 12:56:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:11:50 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5231.110777500362,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79578.35537449978,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:02:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:27:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 180.19965554800243,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59577.81482764667,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:02:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:27:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 169.80007999973168,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 72953.15405799988,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:02:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:27:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.8138107661964,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.84221377872534,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:02:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:27:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.10993344656062,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.01800222368406,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:02:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:27:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6102.600778999886,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6105.109807499957,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:20:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:47:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 105.56953884001207,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 127.2460156533369,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:20:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:47:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.45767050031282,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 88.1446569999298,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:20:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:47:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.50484903749174,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.946839869342064,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:20:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:47:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.01785497536404,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.72338793562835,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:20:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:47:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13024.152943000445,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17937.872257999516,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:37:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:29:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 166.09906876733658,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1133.8330244626788,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:37:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:29:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 130.501374999767,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 227.2085975000664,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:37:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:29:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 116.45557543230827,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 156.57282078697003,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:37:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:29:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 104.93449016074172,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143.8903228539812,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-03 15:37:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:29:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 73612.5902235001,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16867.69092550003,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:36:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:32:48 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54508.354054376,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 226.9577794733262,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:36:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:32:48 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66708.06425700017,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 177.5287084997217,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:36:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:32:48 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.84795469274688,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 138.74064695416712,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:36:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:32:48 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.49598703678805,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 144.08242617070047,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:36:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:32:48 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57355.77882799987,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7964.563253999927,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:11:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:54:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35571.797603253995,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 156.58499460933248,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:11:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:54:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35350.50049350002,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 132.4896505001334,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:11:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:54:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 96.58072383981926,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.35169198589472,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:11:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:54:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 98.55851647084337,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.84237228350271,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:11:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:54:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 249174.2924559994,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6431.401653999956,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:50:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:41:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 235009.30902119138,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 110.18634970667638,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:50:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:41:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 236050.11426999955,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.29239400002507,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:50:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:41:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.79708311294138,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.63681207882916,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:50:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:41:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.43778767154,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.317072432491216,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 13:50:59 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:41:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11550.040795000314,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5993.155903500337,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:16:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:21:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 191.73528062333148,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.28509318669967,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:16:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:21:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 148.74351900016336,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 90.65418249974755,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:16:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:21:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 87.77569215848233,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54.36501160974001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:16:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:21:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.93932269039736,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 47.33126700501665,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-03 14:16:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:21:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -92862,668 +92862,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-06-03T20:14:54Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/fdd9a3143946895e4cec17ba9db28937d48cdfd9"
         },
-        "date": 1717486794145,
+        "date": 1717486834964,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1968.3447554998565,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6440.998319999949,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:19:54 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:38:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 94.80455332328651,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 110.74126664666437,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:19:54 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:38:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.64442700046857,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.40421099998184,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:19:54 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:38:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.541103052491863,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.68560751570767,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:19:54 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:38:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.871362886241304,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.37614568889179,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:19:54 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:38:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6106.36480050016,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6106.371244000002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:18:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:21:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.76388846999937,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.95562292398245,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:18:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:21:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 77.77009599999474,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.63212449989078,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:18:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:21:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.12390406482,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55.39298069218799,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:18:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:21:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.70404936507413,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 48.46992398190285,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:18:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:21:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2072.402735500418,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1885.2783004999765,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:53:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.25535124000089,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.00666148663913,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:53:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.30683899984433,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.026208000432234,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:53:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.805719611744376,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.060310247904635,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:53:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.947181879927312,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.305141051714045,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:38 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:53:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3709.8409445006837,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2532.953448999706,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:45:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:59:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.5810374200385,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 115.43352627734082,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:45:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:59:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.83627799979877,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.26540150037908,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:45:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:59:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.84327153864837,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.779799559480068,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:45:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:59:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.100691502900332,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.013203426807557,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:45:50 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:59:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15567.694932500217,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1991.5965344998767,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:07:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:47:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 211.95435407732356,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 77.52473940003863,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:07:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:47:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 170.61118799983888,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.75204699938695,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:07:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:47:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 125.28643066526487,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.398765246590358,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:07:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:47:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 130.88409230534404,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.453427574962406,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:07:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:47:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1885.0982009998916,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17652.83625300026,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:54:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:32:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.19699156666744,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 224.15647835066798,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:54:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:32:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.964666000065336,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 177.60944749988994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:54:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:32:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.972681839311665,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 146.21477302456094,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:54:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:32:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.240989983625319,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 150.62782272179493,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:54:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:32:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1985.1058935000765,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65234.32594349993,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:48:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:59:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78.90430167327699,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42344.713461258674,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:48:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:59:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.71939650020795,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41879.766031500025,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:48:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:59:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.379385445370405,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 100.30181319539702,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:48:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:59:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.469891548717342,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 102.8601412556169,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:48:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:59:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67769.83289000054,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5174.499678500069,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:03:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 29962.627152307323,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.2496823665909,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:03:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 28717.597102499894,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 63.416820999918855,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:03:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 190.42693975378825,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.91975227240757,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:03:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 201.70061792719784,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.330177188455274,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:03:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 62892.62237150001,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6202.171018999934,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:02:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:08:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40386.830530422,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 105.24800641333363,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:02:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:08:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40184.519388000124,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.38204749990473,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:02:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:08:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 99.63145586858826,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.04272655003397,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:02:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:08:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 101.68223204445646,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.523500103738975,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:02:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:08:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2502.4847879994923,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6108.172721500068,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:00:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:16:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 115.04334467331257,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.90046643668515,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:00:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:16:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.55580650034244,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.81508199984455,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:00:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:16:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.610256669780377,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.19418075671441,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:00:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:16:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.822594410864983,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.85883292820574,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:00:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:16:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5157.87423349866,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 260871.4947394999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:41:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 90.3476442533065,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 245982.4186649913,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:41:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 63.08617849936127,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 247559.58809399977,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:41:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.67354362422549,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.15954414937362,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:41:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.275710766273264,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.58994837999971,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:28 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:41:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 257234.431072,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7992.203209499962,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:42:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:51:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 243333.00650803535,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 156.57673923466547,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:42:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:51:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 243501.93066449993,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 134.29543700010527,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:42:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:51:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.05580158040362,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.85843149885409,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:42:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:51:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.11879649065315,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.675191881449955,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:42:00 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:51:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5429.674158000125,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1968.6618504997568,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:53:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:19:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 183.65393957596098,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.32872556998578,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:53:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:19:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 172.55695099993318,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.17519899958279,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:53:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:19:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.96852669778389,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.543744792374257,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:53:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:19:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.996983684938925,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.818288283459308,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:53:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:19:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2651.403954000216,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5493.706313000075,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:25:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:52:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.47014652266928,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 182.78887714270485,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:25:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:52:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.83390100036559,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 172.23564850064577,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:25:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:52:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.836291896778306,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.06861954781493,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:25:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:52:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.805192921584684,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.19143516996267,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:25:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:52:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6169.347012999879,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2072.734379499707,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:11:50 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 104.59407726000487,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.18704796005962,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:11:50 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.40293249979368,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.60844400035057,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:11:50 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.93313201739429,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.796053535716636,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:11:50 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.38531133968375,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.938045209799384,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:11:50 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79578.35537449978,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81030.78548899999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:27:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:26:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59577.81482764667,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60722.77092625867,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:27:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:26:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 72953.15405799988,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 74389.78645499992,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:27:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:26:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.84221377872534,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.50143122540763,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:27:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:26:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.01800222368406,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.67409435002708,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:27:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:26:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6105.109807499957,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15800.433527000223,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:47:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:06:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 127.2460156533369,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 213.21391819934192,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:47:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:06:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 88.1446569999298,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 171.30661549981596,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:47:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:06:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.946839869342064,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 127.86957135761493,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:47:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:06:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.72338793562835,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 132.48619996002597,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:47:48 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:06:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17937.872257999516,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2631.345844000407,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:29:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:25:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1133.8330244626788,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.99958248667583,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:29:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:25:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 227.2085975000664,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 85.60364950017174,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:29:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:25:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 156.57282078697003,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.76793531848235,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:29:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:25:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.8903228539812,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.83319020247417,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:29:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:25:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16867.69092550003,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18187.496590000592,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:32:48 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:28:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 226.9577794733262,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1217.637657930638,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:32:48 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:28:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 177.5287084997217,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 238.2256780001626,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:32:48 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:28:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 138.74064695416712,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 156.7914285805437,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:32:48 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:28:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 144.08242617070047,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 144.79020775133904,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:32:48 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:28:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7964.563253999927,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3693.1756849999147,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:54:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:45:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 156.58499460933248,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143.14783481670625,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:54:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:45:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 132.4896505001334,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.96243850005703,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:54:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:45:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.35169198589472,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.907789175550537,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:54:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:45:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.84237228350271,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.142332322012734,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:54:22 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:45:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6431.401653999956,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6129.74736849992,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:41:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:44:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 110.18634970667638,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 128.53408270666893,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:41:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:44:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.29239400002507,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 87.85616800014395,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:41:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:44:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.63681207882916,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.06100588782548,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:41:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:44:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.317072432491216,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.85566850307434,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:41:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:44:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5993.155903500337,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69548.69400049938,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:21:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:02:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.28509318669967,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31010.123360934667,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:21:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:02:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 90.65418249974755,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 29984.455719500147,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:21:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:02:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54.36501160974001,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 191.6870091569195,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:21:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:02:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 47.33126700501665,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 202.79108694443562,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:21:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:02:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -93544,668 +93544,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-06-03T20:14:54Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/fdd9a3143946895e4cec17ba9db28937d48cdfd9"
         },
-        "date": 1717486834964,
+        "date": 1717486879069,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6440.998319999949,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5454.024124500393,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:38:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:54:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 110.74126664666437,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 183.08615106667276,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:38:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:54:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.40421099998184,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 171.68465850045322,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:38:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:54:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.68560751570767,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.27397344693836,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:38:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:54:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.37614568889179,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.49306999706456,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:38:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:54:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6106.371244000002,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17870.203808500264,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:21:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:33:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.95562292398245,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 227.24141396668105,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:21:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:33:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.63212449989078,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 185.04103700024643,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:21:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:33:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55.39298069218799,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 148.66718580639292,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:21:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:33:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 48.46992398190285,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 154.2060986545837,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:21:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:33:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1885.2783004999765,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18723.386613999537,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:53:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:29:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.00666148663913,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1533.9662729066622,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:53:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:29:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.026208000432234,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 263.36673950027034,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:53:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:29:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.060310247904635,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 158.0342609946288,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:53:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:29:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.305141051714045,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 146.72360159908413,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:53:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:29:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2532.953448999706,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65868.51025700013,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:59:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:02:05 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 115.43352627734082,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42952.089562826,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:59:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:02:05 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.26540150037908,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42655.900271499944,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:59:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:02:05 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.779799559480068,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 100.90905274563143,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:59:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:02:05 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.013203426807557,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 103.3437727937067,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:59:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:02:05 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1991.5965344998767,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1888.4365559997605,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:47:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:54:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 77.52473940003863,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.1348959600103,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:47:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:54:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.75204699938695,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.01629649952156,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:47:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:54:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.398765246590358,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.058869849482795,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:47:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:54:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.453427574962406,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.30148967918306,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:47:35 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:54:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17652.83625300026,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2633.9105729998664,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:32:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:26:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 224.15647835066798,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.04579430002681,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:32:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:26:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 177.60944749988994,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.63644000019121,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:32:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:26:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 146.21477302456094,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.879160254872048,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:32:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:26:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 150.62782272179493,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.896625118275395,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:32:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:26:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65234.32594349993,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5156.289227500565,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:59:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42344.713461258674,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.94653347339529,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:59:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41879.766031500025,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.3701980006881,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:59:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 100.30181319539702,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.684926592953,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:59:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 102.8601412556169,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.242309537617327,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:59:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5174.499678500069,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7986.295465000012,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:53:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.2496823665909,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 158.19233767466963,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:53:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 63.416820999918855,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 129.38263649994042,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:53:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.91975227240757,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.88789051677662,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:53:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.330177188455274,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.71754815822603,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:53:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6202.171018999934,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2075.414781500058,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:08:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:48 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 105.24800641333363,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.51745272664145,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:08:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:48 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.38204749990473,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.26855750023606,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:08:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:48 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.04272655003397,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.888373822442976,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:08:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:48 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.523500103738975,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.977945790411958,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:08:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:48 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6108.172721500068,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3699.253162501009,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:16:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:46:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.90046643668515,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 144.4739477399647,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:16:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:46:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.81508199984455,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.98442349993275,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:16:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:46:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.19418075671441,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.714238157704624,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:16:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:46:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.85883292820574,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.115445349113315,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:16:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:46:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 260871.4947394999,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 259145.63174399995,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:41:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:42:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 245982.4186649913,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 244845.86074828464,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:41:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:42:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 247559.58809399977,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 246173.34830100002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:41:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:42:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.15954414937362,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.40816401919123,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:41:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:42:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.58994837999971,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.40508523894187,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:41:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:42:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7992.203209499962,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78886.52240450005,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:51:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:27:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 156.57673923466547,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59012.254815819986,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:51:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:27:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 134.29543700010527,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 72044.4028244999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:51:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:27:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.85843149885409,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.02510526002537,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:51:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:27:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.675191881449955,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.75853430024881,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:51:10 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:27:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1968.6618504997568,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15903.70668750029,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:19:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:07:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.32872556998578,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 213.13931954268506,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:19:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:07:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.17519899958279,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 167.40585550041942,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:19:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:07:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.543744792374257,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 128.85636857822524,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:19:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:07:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.818288283459308,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 133.88485618959274,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:19:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:07:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5493.706313000075,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6040.232987499621,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:52:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:22:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 182.78887714270485,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 118.90891652400994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:52:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:22:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 172.23564850064577,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 87.51811549973354,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:52:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:22:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.06861954781493,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54.70704003253289,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:52:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:22:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.19143516996267,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 47.58010815469473,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:52:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:22:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2072.734379499707,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6112.463483500051,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:47:09 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.18704796005962,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 128.97861878998697,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:47:09 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.60844400035057,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.68460800003686,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:47:09 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.796053535716636,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.88820375273403,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:47:09 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.938045209799384,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.82786688691618,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:47:09 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81030.78548899999,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6182.661464499915,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:26:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:11:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60722.77092625867,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 105.88058805329638,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:26:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:11:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 74389.78645499992,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.79186150013084,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:26:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:11:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.50143122540763,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.09224326144527,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:26:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:11:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.67409435002708,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.473210161276306,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:26:19 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:11:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15800.433527000223,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69734.58482200021,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:06:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:03:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 213.21391819934192,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31249.82819040137,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:06:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:03:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 171.30661549981596,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30564.476220500183,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:06:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:03:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 127.86957135761493,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 192.5960953923817,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:06:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:03:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 132.48619996002597,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 203.3323966884214,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:06:27 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:03:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2631.345844000407,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1993.6342444998445,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:25:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:48:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.99958248667583,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78.23711747331497,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:25:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:48:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 85.60364950017174,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.73105949944511,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:25:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:48:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.76793531848235,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.430836418906924,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:25:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:48:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.83319020247417,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.514256438506221,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:25:09 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:48:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18187.496590000592,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6428.612557000008,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:28:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:40:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1217.637657930638,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 110.42314971332341,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:28:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:40:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 238.2256780001626,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.48310099999344,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:28:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:40:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 156.7914285805437,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.62859159317332,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:28:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:40:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 144.79020775133904,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.34685298628225,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:28:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:40:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3693.1756849999147,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1975.9936359996573,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:45:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:20:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.14783481670625,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.27948016336268,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:45:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:20:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.96243850005703,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61.18415350010764,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:45:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:20:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.907789175550537,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.555750750491628,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:45:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:20:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.142332322012734,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.861979915334244,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:45:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:20:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6129.74736849992,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6117.3322049999115,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:44:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:18:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 128.53408270666893,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.90177221667757,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:44:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:18:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 87.85616800014395,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.22100400012278,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:44:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:18:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.06100588782548,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.319982478706756,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:44:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:18:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.85566850307434,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.79184074360817,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:44:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:18:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69548.69400049938,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2506.0447780006143,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:02:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:00:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31010.123360934667,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 115.720866174676,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:02:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:00:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 29984.455719500147,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.5898585003888,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:02:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:00:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 191.6870091569195,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.703319024563818,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:02:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:00:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 202.79108694443562,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.95419341546464,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:02:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:00:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -94226,1350 +94226,1350 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-06-03T20:14:54Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/fdd9a3143946895e4cec17ba9db28937d48cdfd9"
         },
-        "date": 1717486879069,
+        "date": 1717486899331,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5454.024124500393,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5785.776880499725,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:54:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:24:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 183.08615106667276,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 118.92931541067568,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:54:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:24:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 171.68465850045322,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 90.1869515009821,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:54:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:24:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.27397344693836,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 52.67676156071734,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:54:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:24:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.49306999706456,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 45.66763363645848,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:54:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:24:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17870.203808500264,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55960.890349500005,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:33:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:06:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 227.24141396668105,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34505.53588642667,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:33:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:06:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 185.04103700024643,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34309.34696250006,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:33:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:06:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 148.66718580639292,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.76908291292905,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:33:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:06:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 154.2060986545837,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 98.35385544190784,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:33:16 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:06:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18723.386613999537,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6048.551416500004,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:29:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:23:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1533.9662729066622,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.46771771000446,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:29:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:23:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 263.36673950027034,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78.17563349999546,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:29:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:23:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 158.0342609946288,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.69668735925634,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:29:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:23:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 146.72360159908413,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.31330238613524,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:29:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:23:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65868.51025700013,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1927.273313000569,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:02:05 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:24:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42952.089562826,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.58514570668073,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:02:05 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:24:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42655.900271499944,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.797591000507964,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:02:05 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:24:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 100.90905274563143,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.246371649076861,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:02:05 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:24:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 103.3437727937067,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.561082803845053,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:02:05 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:24:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1888.4365559997605,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2554.296359000091,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:54:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:30:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.1348959600103,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 118.69095240934499,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:54:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:30:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.01629649952156,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.05938650028111,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:54:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:30:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.058869849482795,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.02549530720225,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:54:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:30:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.30148967918306,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.146301608164475,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:54:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:30:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2633.9105729998664,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57331.99107350083,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:26:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:05:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.04579430002681,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24318.87264188733,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:26:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:05:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.63644000019121,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 22855.887930999415,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:26:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:05:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.879160254872048,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 180.4469279026636,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:26:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:05:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.896625118275395,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 191.98281539278796,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:26:22 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:05:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5156.289227500565,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6119.579037499989,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:15:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.94653347339529,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 104.05635790664746,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:15:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.3701980006881,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.90187099979994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:15:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.684926592953,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.697206180365264,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:15:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.242309537617327,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.21231925674887,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:14:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:15:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7986.295465000012,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6024.258447000079,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:53:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:51:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 158.19233767466963,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 126.78459783999871,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:53:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:51:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 129.38263649994042,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.81675449998238,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:53:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:51:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.88789051677662,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.272888658516614,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:53:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:51:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.71754815822603,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.24340105675399,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:53:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:51:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2075.414781500058,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12620.563315500476,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:48 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:30:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.51745272664145,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 167.80186781333032,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:48 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:30:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.26855750023606,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 129.6815070008961,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:48 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:30:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.888373822442976,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 113.07529442774695,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:48 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:30:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.977945790411958,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 102.00880712124115,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:13:48 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:30:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3699.253162501009,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 250191.674357,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:46:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:45:44 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 144.4739477399647,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 236125.77282334934,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:46:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:45:44 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.98442349993275,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 236989.25359399983,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:46:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:45:44 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.714238157704624,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.64188447174384,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:46:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:45:44 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.115445349113315,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.79395197603682,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:46:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:45:44 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 259145.63174399995,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1853.0404435000492,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:42:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:58:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 244845.86074828464,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.11207146999732,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:42:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:58:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 246173.34830100002,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60.19081950034888,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:42:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:58:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.40816401919123,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.796614877419039,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:42:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:58:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.40508523894187,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.038254724045085,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:42:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:58:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78886.52240450005,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 74282.63615700007,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:27:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:31:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59012.254815819986,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55073.640529414675,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:27:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:31:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 72044.4028244999,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67166.00807750001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:27:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:31:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.02510526002537,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.38270194958277,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:27:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:31:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.75853430024881,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.41100075899841,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:27:18 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:31:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15903.70668750029,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2391.326898999523,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:07:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:04:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 213.13931954268506,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 114.25296086932576,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:07:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:04:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 167.40585550041942,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.60975599962694,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:07:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:04:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 128.85636857822524,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.76224480426048,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:07:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:04:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 133.88485618959274,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.083277643203157,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:07:19 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:04:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6040.232987499621,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1966.413384000134,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:22:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:52:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 118.90891652400994,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 77.18339140666407,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:22:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:52:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 87.51811549973354,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.758860000394634,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:22:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:52:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54.70704003253289,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.235902758044592,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:22:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:52:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 47.58010815469473,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.286106823696699,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:22:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:52:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6112.463483500051,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7498.087665499952,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:47:09 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:58:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 128.97861878998697,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 155.2915395866703,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:47:09 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:58:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.68460800003686,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 124.13391999996293,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:47:09 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:58:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.88820375273403,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56.17845654502635,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:47:09 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:58:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.82786688691618,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55.71056960190177,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:47:09 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:58:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6182.661464499915,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6369.020522500023,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:11:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:45:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 105.88058805329638,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 109.99698670667082,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:11:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:45:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.79186150013084,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.12162149999313,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:11:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:45:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.09224326144527,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.287103541900656,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:11:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:45:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.473210161276306,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.95729965848386,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:11:49 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:45:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69734.58482200021,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11410.404378999829,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:03:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:11:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31249.82819040137,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 188.55270861600667,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:03:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:11:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30564.476220500183,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 146.18165550018603,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:03:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:11:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 192.5960953923817,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 86.01591651072756,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:03:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:11:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 203.3323966884214,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 88.16611659868101,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:03:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:11:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1993.6342444998445,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12309.922719000042,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:48:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:36:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78.23711747331497,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 202.7500242380041,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:48:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:36:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.73105949944511,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 154.75827799991748,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:48:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:36:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.430836418906924,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 96.52570002857267,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:48:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:36:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.514256438506221,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 98.80388002495819,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:48:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:36:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6428.612557000008,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5111.47914150024,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:40:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:16:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 110.42314971332341,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 88.57914246011205,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:40:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:16:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.48310099999344,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.33024400019349,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:40:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:16:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.62859159317332,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.41309699864617,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:40:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:16:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.34685298628225,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30.981834447155244,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:40:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:16:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1975.9936359996573,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2054.349911999452,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:20:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:17:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.27948016336268,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.28242346670837,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:20:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:17:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 61.18415350010764,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.542881499983196,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:20:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:17:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.555750750491628,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.664829629106839,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:20:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:17:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.861979915334244,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.797083416327892,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:20:03 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:17:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6117.3322049999115,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3659.3875905009554,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:18:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:47:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.90177221667757,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 142.56452539669831,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:18:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:47:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.22100400012278,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.14418849963113,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:18:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:47:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.319982478706756,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.47520356026676,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:18:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:47:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.79184074360817,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.816849227068797,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:18:57 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:47:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2506.0447780006143,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5258.380299500459,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:00:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:56:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 115.720866174676,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 181.02851416803605,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:00:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:56:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.5898585003888,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 171.04110500076786,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:00:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:56:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.703319024563818,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.12063777152027,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:00:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:56:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.95419341546464,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.32876665983879,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:00:34 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:56:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
       {
         "commit": {
           "author": {
-            "name": "dhuangnm",
-            "username": "dhuangnm",
-            "email": "74931910+dhuangnm@users.noreply.github.com"
+            "name": "Michael Goin",
+            "username": "mgoin",
+            "email": "michael@neuralmagic.com"
           },
           "committer": {
             "name": "GitHub",
             "username": "web-flow",
             "email": "noreply@github.com"
           },
-          "id": "fdd9a3143946895e4cec17ba9db28937d48cdfd9",
-          "message": "add latest tag for release docker image (#279)\n\nNeed to tag the latest for the release docker image\r\n\r\n---------\r\n\r\nCo-authored-by: dhuangnm <dhuang@MacBook-Pro-2.local>",
-          "timestamp": "2024-06-03T20:14:54Z",
-          "url": "https://github.com/neuralmagic/nm-vllm/commit/fdd9a3143946895e4cec17ba9db28937d48cdfd9"
+          "id": "0257d9df154c888e4e47cbbb7386dad7c3ef100f",
+          "message": "Update README.md",
+          "timestamp": "2024-06-04T18:22:13Z",
+          "url": "https://github.com/neuralmagic/nm-vllm/commit/0257d9df154c888e4e47cbbb7386dad7c3ef100f"
         },
-        "date": 1717486899331,
+        "date": 1717572964196,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5785.776880499725,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 73755.18542200007,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:24:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:26:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 118.92931541067568,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54781.31541236267,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:24:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:26:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 90.1869515009821,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66663.55898900019,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:24:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:26:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 52.67676156071734,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.39745048123163,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:24:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:26:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 45.66763363645848,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.54248457409597,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:24:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:26:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55960.890349500005,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5274.142381000274,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:06:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:51:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34505.53588642667,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 181.30165391202172,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:06:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:51:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34309.34696250006,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 172.8570250006669,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:06:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:51:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.76908291292905,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.36751358898898,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:06:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:51:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 98.35385544190784,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.34859982848261,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:06:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:51:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6048.551416500004,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6376.852755000016,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:23:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:39:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.46771771000446,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 108.15523370000392,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:23:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:39:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78.17563349999546,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.50477400003274,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:23:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:39:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.69668735925634,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.33136610131439,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:23:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:39:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.31330238613524,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.037428053801804,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:23:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:39:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1927.273313000569,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6031.7559444999915,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:24:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:45:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.58514570668073,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 126.05798475333054,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:24:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:45:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.797591000507964,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.35389249989566,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:24:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:45:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.246371649076861,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.506726089476274,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:24:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:45:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.561082803845053,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.345374117580604,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:24:01 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:45:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2554.296359000091,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1847.9448284997488,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:30:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:52:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 118.69095240934499,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.11867583997689,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:30:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:52:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.05938650028111,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.75746250012526,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:30:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:52:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.02549530720225,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.720778409994828,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:30:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:52:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.146301608164475,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.957975231980813,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:30:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:52:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57331.99107350083,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6115.664882499914,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:05:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:10:46 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24318.87264188733,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 105.67819980002848,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:05:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:10:46 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 22855.887930999415,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.5118680000869,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:05:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:10:46 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 180.4469279026636,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.6407318778397,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:05:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:10:46 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 191.98281539278796,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.16146819897536,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:05:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:10:46 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6119.579037499989,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5123.928123499354,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:15:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:11:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 104.05635790664746,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.0871250799925,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:15:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:11:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.90187099979994,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.15235899919935,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:15:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:11:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.697206180365264,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.38484088215165,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:15:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:11:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.21231925674887,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30.957168229789495,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:15:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:11:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6024.258447000079,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2053.7065545004225,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:51:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:11:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 126.78459783999871,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.75048137999693,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:51:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:11:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.81675449998238,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.944025999830046,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:51:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:11:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.272888658516614,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.684640059364112,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:51:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:11:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.24340105675399,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.797214167940783,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:51:57 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:11:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12620.563315500476,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2412.450625499787,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:30:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:58:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 167.80186781333032,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 114.58843513332977,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:30:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:58:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 129.6815070008961,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.94715549966713,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:30:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:58:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 113.07529442774695,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.97720822256229,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:30:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:58:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 102.00880712124115,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.207593782838163,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:30:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:58:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 250191.674357,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6051.014324999869,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:45:44 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:17:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 236125.77282334934,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.20469047333943,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:45:44 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:17:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 236989.25359399983,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 77.93934050005191,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:45:44 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:17:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.64188447174384,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.5328849296462,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:45:44 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:17:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.79395197603682,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.22912378618021,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:45:44 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:17:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1853.0404435000492,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11522.47239799999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:58:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:05:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.11207146999732,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 189.9282906139882,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:58:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:05:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60.19081950034888,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 149.82469449978453,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:58:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:05:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.796614877419039,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 87.57810831842728,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:58:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:05:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.038254724045085,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.81603877024982,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:58:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:05:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 74282.63615700007,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57308.29728949993,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:31:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:00:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55073.640529414675,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24429.24578628534,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:31:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:00:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67166.00807750001,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 22884.999396499552,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:31:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:00:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.38270194958277,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 181.12438337329968,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:31:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:00:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.41100075899841,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 192.18140868332165,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:31:16 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:00:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2391.326898999523,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12847.864813999877,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:04:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:30:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 114.25296086932576,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 202.18938731532884,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:04:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:30:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.60975599962694,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 156.56327750002674,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:04:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:30:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.76224480426048,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 100.64073281152855,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:04:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:30:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.083277643203157,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 103.13423978338325,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:04:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:30:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1966.413384000134,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5772.151614000904,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:52:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:18:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 77.18339140666407,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 115.26393409336985,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:52:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:18:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.758860000394634,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 86.76324100088095,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:52:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:18:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.235902758044592,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 52.595730911464535,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:52:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:18:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.286106823696699,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 45.781826257653435,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 03:52:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:18:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7498.087665499952,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12629.358229999525,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:58:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:26:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 155.2915395866703,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 162.56727568198158,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:58:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:26:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 124.13391999996293,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 128.46049300060258,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:58:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:26:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56.17845654502635,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 112.88163107184694,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:58:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:26:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55.71056960190177,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 101.52381771349468,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:58:29 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:26:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6369.020522500023,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55257.30873649991,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:45:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:59:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 109.99698670667082,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34014.00603094267,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:45:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:59:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.12162149999313,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 33341.29117949999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:45:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:59:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.287103541900656,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.60821701314372,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:45:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:59:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.95729965848386,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 97.78219893172137,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 02:45:46 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:59:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11410.404378999829,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1968.1786680002915,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:11:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:46:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 188.55270861600667,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 77.12485840001439,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:11:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:46:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 146.18165550018603,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.845241499984695,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:11:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:46:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 86.01591651072756,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.227909963066596,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:11:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:46:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 88.16611659868101,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.32405219681264,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:11:23 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:46:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12309.922719000042,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7460.377201500023,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:36:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:51:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 202.7500242380041,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 155.0358747626657,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:36:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:51:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 154.75827799991748,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 133.64867150005466,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:36:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:51:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 96.52570002857267,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55.98506018080652,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:36:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:51:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 98.80388002495819,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55.65100405095286,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:36:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:51:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5111.47914150024,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3654.1526315004376,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:16:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:43:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 88.57914246011205,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 142.80341375336016,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:16:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:43:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.33024400019349,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.23372050039325,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:16:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:43:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.41309699864617,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.38280171831792,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:16:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:43:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30.981834447155244,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.700130815657285,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 05:16:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:43:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2054.349911999452,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1929.135523500463,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:17:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:17:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.28242346670837,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.89699821668425,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:17:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:17:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.542881499983196,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.17522600000302,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:17:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:17:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.664829629106839,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.259917131621624,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:17:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:17:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.797083416327892,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.53502009480749,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-04 04:17:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:17:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3659.3875905009554,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2544.4823535003707,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:47:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:23:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 142.56452539669831,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 118.51974250535325,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:47:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:23:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.14418849963113,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 82.8322054999262,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:47:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:23:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.47520356026676,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.874461210297007,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:47:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:23:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.816849227068797,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.96535923873111,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:47:17 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:23:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5258.380299500459,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 249719.7052050003,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:56:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:40:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 181.02851416803605,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 235743.81317350536,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:56:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:40:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 171.04110500076786,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 236612.42044949994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:56:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:40:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.12063777152027,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.52977677315367,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:56:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:40:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.32876665983879,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.70908300783768,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.4.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-04 04:56:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:40:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -95590,668 +95590,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-06-04T18:22:13Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/0257d9df154c888e4e47cbbb7386dad7c3ef100f"
         },
-        "date": 1717572964196,
+        "date": 1717573182072,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 73755.18542200007,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1975.7934499998555,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:26:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:20:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54781.31541236267,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.33599586665939,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:26:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:20:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66663.55898900019,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.25566899941259,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:26:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:20:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.39745048123163,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.613003521163858,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:26:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:20:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.54248457409597,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.903863959951174,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:26:04 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:20:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5274.142381000274,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1990.530694500194,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:51:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 181.30165391202172,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.04017088661931,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:51:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 172.8570250006669,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.069916500215186,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:51:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.36751358898898,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.352486622843355,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:51:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.34859982848261,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.455696360151357,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:51:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6376.852755000016,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65623.97806299999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:39:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:01:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 108.15523370000392,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42548.72657927333,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:39:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:01:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.50477400003274,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42052.41047800007,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:39:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:01:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.33136610131439,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 100.42581577642753,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:39:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:01:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.037428053801804,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 102.73442258920154,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:39:02 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:01:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6031.7559444999915,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5165.230333999716,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:45:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:14:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 126.05798475333054,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.18472634663583,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:45:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:14:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.35389249989566,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60.3192204998777,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:45:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:14:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.506726089476274,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.74831494379642,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:45:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:14:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.345374117580604,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.263469881805783,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:45:13 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:14:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1847.9448284997488,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 71233.34427600002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:52:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:03:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.11867583997689,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31836.63932091001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:52:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:03:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.75746250012526,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31021.99345149984,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:52:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:03:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.720778409994828,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 193.19795882911964,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:52:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:03:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.957975231980813,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 204.57356740026793,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:52:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:03:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6115.664882499914,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6422.19940900003,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:10:46 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:40:37 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 105.67819980002848,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 109.88173289333115,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:10:46 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:40:37 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.5118680000869,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.78409450004119,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:10:46 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:40:37 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.6407318778397,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.57983124137713,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:10:46 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:40:37 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.16146819897536,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.267313983479774,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:10:46 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:40:37 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5123.928123499354,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6109.015791500269,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:11:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:19:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.0871250799925,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 122.68514260334541,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:11:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:19:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.15235899919935,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.57903599994643,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:11:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:19:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.38484088215165,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.146640564830165,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:11:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:19:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30.957168229789495,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.725523279960974,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:11:42 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:19:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2053.7065545004225,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78529.7785214998,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:11:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:27:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.75048137999693,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58721.69160256535,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:11:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:27:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.944025999830046,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 71508.66835149986,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:11:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:27:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.684640059364112,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.09258718482668,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:11:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:27:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.797214167940783,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.70630902656838,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:11:51 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:27:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2412.450625499787,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6104.897983000001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:58:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:46:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 114.58843513332977,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 126.89376464333085,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:58:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:46:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.94715549966713,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 85.76336650003213,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:58:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:46:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.97720822256229,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.06915336348656,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:58:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:46:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.207593782838163,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.874942475419786,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:58:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:46:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6051.014324999869,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 260933.7399484998,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:17:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:42:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.20469047333943,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 246348.652753672,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:17:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:42:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 77.93934050005191,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 247753.58967099988,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:17:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:42:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.5328849296462,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 71.52547558982853,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:17:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:42:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.22912378618021,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.37765993345238,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:17:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:42:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11522.47239799999,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2068.365681999694,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:05:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 189.9282906139882,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.17974005337844,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:05:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 149.82469449978453,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.95567000013034,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:05:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 87.57810831842728,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.835997486112298,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:05:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.81603877024982,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.943342617564616,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:05:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57308.29728949993,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18863.58120349996,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:00:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:29:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24429.24578628534,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1483.9681253440042,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:00:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:29:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 22884.999396499552,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 265.1577640008327,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:00:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:29:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 181.12438337329968,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.88222256506052,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:00:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:29:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 192.18140868332165,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 145.82300105146194,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:00:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:29:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12847.864813999877,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6040.10625599949,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:30:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:21:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 202.18938731532884,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.45821917999032,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:30:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:21:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 156.56327750002674,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.84418099994218,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:30:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:21:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 100.64073281152855,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54.64609627110425,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:30:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:21:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 103.13423978338325,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 47.86997792578394,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:30:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:21:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5772.151614000904,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2505.3557870000986,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:18:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:00:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 115.26393409336985,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 115.1015805199798,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:18:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:00:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 86.76324100088095,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.43590049985505,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:18:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:00:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 52.595730911464535,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.655124752041274,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:18:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:00:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 45.781826257653435,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.882650920331617,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:18:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:00:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12629.358229999525,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15549.75253199973,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:26:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:07:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 162.56727568198158,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 212.969658120659,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:26:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:07:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 128.46049300060258,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 168.86559050044525,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:26:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:07:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 112.88163107184694,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 125.20656284526063,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:26:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:07:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 101.52381771349468,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 130.60245925788612,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:26:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:07:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55257.30873649991,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5455.755123000927,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:59:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:54:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34014.00603094267,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 184.0125064293631,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:59:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:54:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 33341.29117949999,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 172.26980849954998,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:59:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:54:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.60821701314372,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.34378620451925,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:59:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:54:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 97.78219893172137,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.65310148491302,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:59:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:54:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1968.1786680002915,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2630.551425500016,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:46:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:26:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 77.12485840001439,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.0175972480065,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:46:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:26:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.845241499984695,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.42846149980687,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:46:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:26:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.227909963066596,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.839985249298476,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:46:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:26:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.32405219681264,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.839909343441953,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:46:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:26:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7460.377201500023,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3697.770496000885,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:51:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:46:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 155.0358747626657,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 144.23739775337404,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:51:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:46:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 133.64867150005466,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.34760749920679,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:51:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:46:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55.98506018080652,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.870631533442317,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:51:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:46:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55.65100405095286,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.040367541896167,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:51:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:46:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3654.1526315004376,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1896.827923500041,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:43:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:54:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 142.80341375336016,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.33098896669617,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:43:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:54:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.23372050039325,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.04740800001673,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:43:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:54:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.38280171831792,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.077754018439807,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:43:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:54:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.700130815657285,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.319771208280605,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:43:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:54:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1929.135523500463,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6178.988775500102,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:17:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:11:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.89699821668425,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 105.24486398669069,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:17:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:11:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.17522600000302,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.42057300013948,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:17:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:11:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.259917131621624,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.014338630567,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:17:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:11:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.53502009480749,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.53110167117665,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:17:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:11:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2544.4823535003707,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17629.28239049961,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:23:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:33:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 118.51974250535325,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 228.15737318399079,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:23:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:33:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 82.8322054999262,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 183.52132100017116,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:23:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:33:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.874461210297007,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 146.45657947859132,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:23:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:33:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.96535923873111,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 152.18902241546186,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:23:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:33:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 249719.7052050003,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 8023.916034499962,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:40:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:53:25 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 235743.81317350536,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.39932492533202,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:40:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:53:25 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 236612.42044949994,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 130.8959225000308,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:40:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:53:25 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.52977677315367,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.86249245694424,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:40:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:53:25 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.70908300783768,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.52022299866027,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:40:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:53:25 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -96272,668 +96272,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-06-04T18:22:13Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/0257d9df154c888e4e47cbbb7386dad7c3ef100f"
         },
-        "date": 1717573182072,
+        "date": 1717573292382,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1975.7934499998555,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1997.095519499453,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:20:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.33599586665939,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.49410707335119,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:20:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.25566899941259,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.92237950044364,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:20:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.613003521163858,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.423041893780688,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:20:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.903863959951174,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.522508858599581,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:20:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1990.530694500194,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3708.8484034998146,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:43:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.04017088661931,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 144.70396740337796,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:43:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.069916500215186,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 96.86096099994757,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:43:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.352486622843355,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.751408917969854,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:43:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.455696360151357,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.16263272998981,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:43:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65623.97806299999,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70348.65259299931,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:01:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:03:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42548.72657927333,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31540.73703901402,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:01:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:03:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42052.41047800007,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30937.1006654992,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:01:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:03:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 100.42581577642753,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 193.07305639892365,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:01:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:03:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 102.73442258920154,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 204.23880160253765,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:01:45 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:03:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5165.230333999716,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6097.933693500181,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:14:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:19:03 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.18472634663583,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.86066675333373,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:14:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:19:03 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60.3192204998777,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.7730484999829,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:14:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:19:03 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.74831494379642,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.20517006487294,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:14:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:19:03 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.263469881805783,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.686507297140324,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:14:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:19:03 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 71233.34427600002,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6452.393932000006,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:03:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:40:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31836.63932091001,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 110.7267989666669,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:03:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:40:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31021.99345149984,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.27968650004368,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:03:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:40:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 193.19795882911964,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.68642417315979,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:03:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:40:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 204.57356740026793,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.35562801671746,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:03:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:40:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6422.19940900003,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6124.660605999907,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:40:37 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:46:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 109.88173289333115,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 128.36626168998959,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:40:37 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:46:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.78409450004119,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.43713399992703,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:40:37 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:46:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.57983124137713,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.95552879690462,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:40:37 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:46:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.267313983479774,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.76947833644865,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:40:37 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:46:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6109.015791500269,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1887.9630804999579,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:19:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:54:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 122.68514260334541,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.94640251327655,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:19:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:54:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.57903599994643,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60.55626250008572,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:19:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:54:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.146640564830165,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.012353670994822,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:19:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:54:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.725523279960974,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.249874644191436,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:19:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:54:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78529.7785214998,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79988.87884600004,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:27:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:27:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58721.69160256535,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59738.638552652,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:27:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:27:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 71508.66835149986,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 73359.15299499992,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:27:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:27:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.09258718482668,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.03593499119127,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:27:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:27:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.70630902656838,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.07576343076235,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:27:23 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:27:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6104.897983000001,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6063.049027000488,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:46:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:21:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 126.89376464333085,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.42886400928546,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:46:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:21:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 85.76336650003213,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.7157725003126,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:46:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:21:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.06915336348656,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54.8214165161257,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:46:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:21:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.874942475419786,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 48.03518404416397,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:46:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:21:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 260933.7399484998,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5198.608692498965,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:42:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:13:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 246348.652753672,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 90.76383455999651,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:42:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:13:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 247753.58967099988,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 62.78620149987546,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:42:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:13:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 71.52547558982853,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.83553935022538,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:42:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:13:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.37765993345238,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.3345803907413,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:42:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:13:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2068.365681999694,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1966.553798500172,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:20:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.17974005337844,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.29166702000414,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:20:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.95567000013034,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.26092300048913,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:20:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.835997486112298,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.558316077371831,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:20:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.943342617564616,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.834920917726885,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:20:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18863.58120349996,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19222.069294000903,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:29:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:28:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1483.9681253440042,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1812.5878313566814,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:29:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:28:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 265.1577640008327,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 295.03822549941106,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:29:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:28:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.88222256506052,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 159.2759532426503,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:29:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:28:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 145.82300105146194,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 147.19713947191278,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:29:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:28:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6040.10625599949,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 259086.0955725002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:21:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:42:10 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.45821917999032,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 244112.833214076,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:21:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:42:10 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.84418099994218,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 245980.72280350016,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:21:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:42:10 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54.64609627110425,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.04524546886341,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:21:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:42:10 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 47.86997792578394,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.15701582574867,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:21:58 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:42:10 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2505.3557870000986,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2071.589972500533,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:00:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 115.1015805199798,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.63230001999182,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:00:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.43590049985505,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.06669199973112,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:00:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.655124752041274,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.828980113139927,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:00:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.882650920331617,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.936821029889687,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:00:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15549.75253199973,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2636.5531265000755,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:07:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:26:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 212.969658120659,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.24517320798866,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:07:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:26:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 168.86559050044525,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.84799849959745,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:07:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:26:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 125.20656284526063,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.85569717316406,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:07:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:26:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 130.60245925788612,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.877478772904112,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:07:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:26:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5455.755123000927,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5440.578917499806,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:54:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:52:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 184.0125064293631,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 182.5354573026731,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:54:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:52:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 172.26980849954998,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 171.29180650044873,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:54:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:52:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.34378620451925,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.02768010298062,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:54:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:52:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.65310148491302,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.25011724984416,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:54:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:52:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2630.551425500016,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17301.371945000028,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:26:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:33:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.0175972480065,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 224.18166553200354,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:26:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:33:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.42846149980687,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 181.0413339999286,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:26:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:33:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.839985249298476,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 142.13134217250737,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:26:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:33:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.839909343441953,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 147.33623283996045,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:26:18 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:33:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3697.770496000885,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2525.048247500308,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:46:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:00:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 144.23739775337404,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 115.72952551065949,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:46:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:00:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.34760749920679,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.95160250013578,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:46:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:00:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.870631533442317,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.661041977349463,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:46:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:00:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.040367541896167,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.853015171401005,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:46:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:00:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1896.827923500041,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15602.865738499986,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:54:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:07:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.33098896669617,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 210.0430408546581,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:54:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:07:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.04740800001673,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 172.34792499994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:54:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:07:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.077754018439807,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 126.3320489613039,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:54:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:07:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.319771208280605,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 131.8927493348655,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:54:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:07:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6178.988775500102,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 8026.447241000028,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:11:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:52:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 105.24486398669069,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.6388383813328,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:11:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:52:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.42057300013948,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 129.0641964999395,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:11:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:52:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.014338630567,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.9432166560783,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:11:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:52:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.53110167117665,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.63410415277631,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:11:54 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:52:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17629.28239049961,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6170.664217999956,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:33:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:11:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 228.15737318399079,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 105.15708289333816,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:33:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:11:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 183.52132100017116,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.19553099990117,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:33:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:11:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 146.45657947859132,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.07245940430774,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:33:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:11:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 152.18902241546186,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.50399300944225,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:33:11 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:11:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 8023.916034499962,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65001.06689899997,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:53:25 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:01:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.39932492533202,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42183.08303530666,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:53:25 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:01:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 130.8959225000308,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42266.78036499993,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:53:25 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:01:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.86249245694424,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 100.48058928235726,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:53:25 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:01:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.52022299866027,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 102.91195303058437,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:53:25 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:01:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -96954,1350 +96954,1350 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-06-04T18:22:13Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/0257d9df154c888e4e47cbbb7386dad7c3ef100f"
         },
-        "date": 1717573292382,
+        "date": 1717573342462,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1997.095519499453,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 77863.63307100009,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:29:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.49410707335119,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58281.66058818401,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:29:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.92237950044364,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 71254.30481450008,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:29:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.423041893780688,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.18448559680624,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:29:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.522508858599581,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.57614076542656,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:29:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3708.8484034998146,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6169.880697000053,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:43:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:14:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 144.70396740337796,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 106.46133026002947,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:43:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:14:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 96.86096099994757,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.49139850007668,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:43:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:14:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.751408917969854,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.919925371456834,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:43:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:14:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.16263272998981,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.40104160368872,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:43:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:14:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70348.65259299931,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16823.780649499895,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:03:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:35:23 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31540.73703901402,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 222.3978496020136,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:03:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:35:23 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30937.1006654992,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 174.9299864995919,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:03:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:35:23 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 193.07305639892365,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 135.8074422525249,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:03:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:35:23 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 204.23880160253765,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 142.19356786475004,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:03:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:35:23 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6097.933693500181,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17811.0223725007,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:19:03 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:31:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.86066675333373,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1061.8634171593512,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:19:03 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:31:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.7730484999829,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 227.86593749970052,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:19:03 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:31:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.20517006487294,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 155.41090960013378,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:19:03 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:31:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.686507297140324,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143.09971559737025,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:19:03 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:31:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6452.393932000006,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3699.151089499537,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:40:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:48:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 110.7267989666669,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 142.86275332994893,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:40:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:48:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.27968650004368,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.76811899983295,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:40:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:48:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.68642417315979,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.791819456090682,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:40:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:48:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.35562801671746,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.067366447461946,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:40:03 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:48:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6124.660605999907,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6002.976175000185,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:46:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:24:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 128.36626168998959,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 118.60854235867495,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:46:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:24:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.43713399992703,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.63529949960503,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:46:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:24:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.95552879690462,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54.369098626418904,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:46:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:24:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.76947833644865,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 47.3331707284874,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:46:16 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:24:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1887.9630804999579,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2522.696570500102,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:54:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:02:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.94640251327655,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 115.05474123196716,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:54:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:02:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60.55626250008572,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.29174350031099,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:54:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:02:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.012353670994822,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.53805063470437,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:54:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:02:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.249874644191436,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.71867678207576,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:54:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:02:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79988.87884600004,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5440.339908500391,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:27:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:56:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59738.638552652,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 183.3716414972999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:27:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:56:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 73359.15299499992,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 170.48778099979245,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:27:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:56:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.03593499119127,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.94271573698874,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:27:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:56:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.07576343076235,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.24281609001152,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:27:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:56:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6063.049027000488,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6110.254696499965,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:21:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:49:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.42886400928546,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 129.08200832666654,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:21:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:49:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.7157725003126,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 90.9221575001311,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:21:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:49:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54.8214165161257,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.88085330292909,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:21:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:49:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 48.03518404416397,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.81224940285663,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:21:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:49:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5198.608692498965,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5154.448082000272,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:13:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:17:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 90.76383455999651,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.23465427997144,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:13:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:17:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 62.78620149987546,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61.30725599996367,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:13:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:17:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.83553935022538,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.775983947753744,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:13:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:17:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.3345803907413,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.273593288596942,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:13:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:17:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1966.553798500172,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 62147.10263950019,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:20:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:04:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.29166702000414,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39766.26892405266,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:20:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:04:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.26092300048913,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39408.58177550001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:20:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:04:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.558316077371831,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 98.89464387296319,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:20:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:04:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.834920917726885,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 101.30746616648189,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:20:14 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:04:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19222.069294000903,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6096.493981000094,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:28:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:21:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1812.5878313566814,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.65361824999916,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:28:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:21:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 295.03822549941106,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.98213850020875,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:28:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:21:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 159.2759532426503,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.331489254710824,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:28:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:21:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 147.19713947191278,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.851920693577306,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:28:21 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:21:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 259086.0955725002,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7975.029886000016,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:42:10 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:56:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 244112.833214076,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.9884472533413,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:42:10 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:56:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 245980.72280350016,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 135.43239950001862,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:42:10 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:56:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.04524546886341,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.62101486594874,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:42:10 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:56:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.15701582574867,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.21389066095032,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:42:10 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:56:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2071.589972500533,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1992.651213499812,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:50:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.63230001999182,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78.69765539995569,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:50:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.06669199973112,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.1947960001271,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:50:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.828980113139927,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.402564449955385,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:50:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.936821029889687,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.479954827523413,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:13:57 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:50:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2636.5531265000755,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69047.98245550046,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:26:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:05:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.24517320798866,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30681.225544506644,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:26:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:05:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.84799849959745,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 29819.8609330002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:26:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:05:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.85569717316406,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 191.35947444170938,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:26:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:05:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.877478772904112,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 201.84431503164717,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:26:34 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:05:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5440.578917499806,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2616.2008219998825,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:52:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:28:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 182.5354573026731,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.6278214760144,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:52:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:28:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 171.29180650044873,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.79476649972639,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:52:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:28:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.02768010298062,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.69699261167778,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:52:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:28:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.25011724984416,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.6767269947816,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:52:46 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:28:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17301.371945000028,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1964.6460755002408,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:33:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:22:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 224.18166553200354,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.28143891663603,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:33:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:22:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 181.0413339999286,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60.55193900010636,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:33:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:22:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 142.13134217250737,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.63106773922918,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:33:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:22:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 147.33623283996045,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.841530274834774,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:33:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:22:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2525.048247500308,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2074.2727599999853,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:00:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:16:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 115.72952551065949,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.70022990000386,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:00:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:16:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.95160250013578,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.03294449987516,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:00:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:16:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.661041977349463,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.826978801172709,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:00:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:16:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.853015171401005,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.95713960298148,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:00:47 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:16:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15602.865738499986,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 258395.27599550024,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:07:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:44:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 210.0430408546581,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 243924.9034436213,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:07:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:44:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 172.34792499994,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 245393.95490599985,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:07:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:44:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 126.3320489613039,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.57896753179935,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:07:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:44:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 131.8927493348655,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.31930061260277,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:07:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:44:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 8026.447241000028,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6430.6718875000115,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:52:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:43:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.6388383813328,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 110.98738446666934,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:52:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:43:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 129.0641964999395,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.75269950000984,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:52:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:43:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.9432166560783,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.660171282886196,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:52:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:43:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
-          {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.63410415277631,
+          {
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.35048996213381,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:52:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:43:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6170.664217999956,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 14931.094657999893,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:11:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:09:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 105.15708289333816,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 208.00185868199515,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:11:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:09:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.19553099990117,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 169.5139149996976,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:11:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:09:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.07245940430774,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.07384939298134,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:11:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:09:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.50399300944225,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 127.14083376761975,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:11:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:09:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65001.06689899997,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1884.3689440000162,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:01:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:56:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42183.08303530666,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.87971182333058,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:01:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:56:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42266.78036499993,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.980469000540324,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:01:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:56:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 100.48058928235726,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.001923244417402,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:01:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:56:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 102.91195303058437,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.251956720453101,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:01:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:56:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
       {
         "commit": {
           "author": {
-            "name": "Michael Goin",
-            "username": "mgoin",
-            "email": "michael@neuralmagic.com"
+            "name": "dhuangnm",
+            "username": "dhuangnm",
+            "email": "74931910+dhuangnm@users.noreply.github.com"
           },
           "committer": {
             "name": "GitHub",
             "username": "web-flow",
             "email": "noreply@github.com"
           },
-          "id": "0257d9df154c888e4e47cbbb7386dad7c3ef100f",
-          "message": "Update README.md",
-          "timestamp": "2024-06-04T18:22:13Z",
-          "url": "https://github.com/neuralmagic/nm-vllm/commit/0257d9df154c888e4e47cbbb7386dad7c3ef100f"
+          "id": "367c5ee80cc75f5d5b6af72de5e1e5e463e386f7",
+          "message": "strip binaries (#283)\n\nCo-authored-by: dhuangnm <dhuang@MacBook-Pro-2.local>",
+          "timestamp": "2024-06-05T21:03:26Z",
+          "url": "https://github.com/neuralmagic/nm-vllm/commit/367c5ee80cc75f5d5b6af72de5e1e5e463e386f7"
         },
-        "date": 1717573342462,
+        "date": 1717659311471,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 77863.63307100009,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2071.795236000071,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:29:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58281.66058818401,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.62129434664408,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:29:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 71254.30481450008,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.27264349997495,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:29:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.18448559680624,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.834207957060189,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:29:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.57614076542656,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.929855224038599,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:29:32 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6169.880697000053,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 258306.14780649988,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:14:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:36:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 106.46133026002947,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 243799.61443960338,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:14:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:36:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.49139850007668,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 245117.86312399953,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:14:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:36:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.919925371456834,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.64041041772076,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:14:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:36:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.40104160368872,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.2662787212779,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:14:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:36:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16823.780649499895,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7922.697949500048,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:35:23 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:48:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 222.3978496020136,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.31748106933415,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:35:23 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:48:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 174.9299864995919,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 125.85172500007502,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:35:23 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:48:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 135.8074422525249,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.316413436361834,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:35:23 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:48:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 142.19356786475004,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.95898594036765,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:35:23 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:48:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17811.0223725007,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6151.938715500023,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:31:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1061.8634171593512,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 127.07350583665477,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:31:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 227.86593749970052,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.0308049999976,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:31:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 155.41090960013378,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.024544313732136,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:31:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.09971559737025,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.782349249063266,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:31:45 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3699.151089499537,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68037.37022549921,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:48:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:58:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 142.86275332994893,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30100.42021927734,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:48:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:58:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.76811899983295,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 28906.142724000347,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:48:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:58:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.791819456090682,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 190.25804743862463,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:48:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:58:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.067366447461946,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 201.1107669688994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:48:25 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:58:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6002.976175000185,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5146.531957000661,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:24:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:09:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 118.60854235867495,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.62724450000678,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:24:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:09:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.63529949960503,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.349385500150674,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:24:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:09:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54.369098626418904,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.724407216546886,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:24:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:09:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 47.3331707284874,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.250674101384273,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:24:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:09:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2522.696570500102,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6213.317461499855,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:02:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:05:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 115.05474123196716,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 104.93692238665668,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:02:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:05:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.29174350031099,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.11752150006578,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:02:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:05:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.53805063470437,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.909500825725175,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:02:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:05:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.71867678207576,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.38810673453767,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:02:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:05:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5440.339908500391,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6107.461079999894,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:56:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:12:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 183.3716414972999,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 122.31564741000209,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:56:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:12:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 170.48778099979245,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.53915549999147,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:56:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:12:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.94271573698874,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.25123814329617,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:56:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:12:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.24281609001152,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.85368992077919,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 04:56:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:12:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6110.254696499965,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1993.1376780000392,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:49:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 129.08200832666654,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78.2969698333424,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:49:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 90.9221575001311,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42.129797999677976,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:49:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.88085330292909,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.455911070524213,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:49:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.81224940285663,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.530717232272059,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:49:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5154.448082000272,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16751.494377999734,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:17:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:33 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.23465427997144,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 219.94728134001465,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:17:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:33 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 61.30725599996367,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 174.75347900017368,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:17:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:33 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.775983947753744,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 137.40456832042594,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:17:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:33 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.273593288596942,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 142.76655642728957,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:17:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:33 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 62147.10263950019,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17889.05048000015,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:04:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:23:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39766.26892405266,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1083.3993673066313,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:04:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:23:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39408.58177550001,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 229.99817649906618,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:04:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:23:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 98.89464387296319,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 155.99140842961486,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:04:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:23:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 101.30746616648189,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143.93596860500983,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:04:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:23:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6096.493981000094,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2619.253929000024,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:21:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:20:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.65361824999916,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.25586048800324,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:21:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:20:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.98213850020875,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.6595824998767,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:21:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:20:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.331489254710824,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.589928769144386,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:21:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:20:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.851920693577306,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.50490839269328,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:21:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:20:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7975.029886000016,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80558.27551649987,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:56:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:21:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.9884472533413,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60218.67284408668,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:56:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:21:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 135.43239950001862,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 74254.7567985,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:56:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:21:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.62101486594874,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.85018591317444,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:56:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:21:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.21389066095032,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.08448521974842,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:56:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:21:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1992.651213499812,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 63741.531855999936,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:50:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:56:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78.69765539995569,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41159.210393221336,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:50:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:56:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.1947960001271,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40212.179885499834,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:50:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:56:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.402564449955385,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 99.67944955550327,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:50:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:56:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.479954827523413,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 102.0865461263572,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:50:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:56:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69047.98245550046,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15127.325313500023,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:05:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:02:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30681.225544506644,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 210.2396104793285,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:05:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:02:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 29819.8609330002,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 168.13532199967085,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:05:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:02:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 191.35947444170938,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 122.39129914192421,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:05:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:02:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 201.84431503164717,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 128.2692568067652,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-05 05:05:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:02:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2616.2008219998825,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1895.625746499718,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:28:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.6278214760144,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.22735414663475,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:28:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.79476649972639,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.00738999960231,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:28:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.69699261167778,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.011212652119212,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:28:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.6767269947816,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.276344271528881,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:28:36 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1964.6460755002408,
+            "value": 1950.5479389999891,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:22:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:14:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.28143891663603,
+            "value": 94.45890538333211,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:22:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:14:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60.55193900010636,
+            "value": 59.743625999999495,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:22:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:14:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.63106773922918,
+            "value": 13.461409176814545,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:22:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:14:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
             "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.841530274834774,
+            "value": 12.74895741758177,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:22:17 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:14:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2074.2727599999853,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6046.4773440007775,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:16:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:16:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.70022990000386,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 118.64030966269152,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:16:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:16:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.03294449987516,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 88.55446850066073,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:16:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:16:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.826978801172709,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54.628013159921274,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:16:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:16:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.95713960298148,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 47.59649299788427,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:16:02 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:16:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 258395.27599550024,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3714.1871910007467,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:44:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:40:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 243924.9034436213,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143.33834539669624,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:44:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:40:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 245393.95490599985,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 94.42250300071464,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:44:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:40:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.57896753179935,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.83992388186532,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:44:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:40:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.31930061260277,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.07884507100644,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:44:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:40:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6430.6718875000115,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2505.978134999623,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:43:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 110.98738446666934,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 114.93389008799689,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:43:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.75269950000984,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.6131885003415,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:43:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.660171282886196,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.713543749562035,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:43:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.35048996213381,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.84215263645413,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 02:43:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 14931.094657999893,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5536.584911499631,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:09:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:48:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 208.00185868199515,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 182.84844596668216,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:09:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:48:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 169.5139149996976,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 174.18272499980958,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:09:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:48:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.07384939298134,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.93757760724695,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:09:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:48:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 127.14083376761975,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.113197757742746,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 04:09:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:48:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1884.3689440000162,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6422.859499999959,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:56:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:35:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.87971182333058,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 111.53755632000184,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:56:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:35:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.980469000540324,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.94469050002772,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:56:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:35:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.001923244417402,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.634538236675354,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:56:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:35:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.251956720453101,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.28795376571324,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-05 03:56:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:35:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -98318,668 +98318,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-06-05T21:03:26Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/367c5ee80cc75f5d5b6af72de5e1e5e463e386f7"
         },
-        "date": 1717659311471,
+        "date": 1717659337293,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2071.795236000071,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7561.334372500028,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:53:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.62129434664408,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 155.53552857467307,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:53:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.27264349997495,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 131.37896199998522,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:53:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.834207957060189,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56.343515332182015,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:53:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.929855224038599,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55.79649200897705,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:53:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 258306.14780649988,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5124.215013000139,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:36:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:10:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 243799.61443960338,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.17597861989634,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:36:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:10:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 245117.86312399953,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.99882400010392,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:36:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:10:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.64041041772076,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.439815983475754,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:36:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:10:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.2662787212779,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30.997657132015405,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:36:02 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:10:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7922.697949500048,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1853.9279535002606,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:48:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:53:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.31748106933415,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.09767266334832,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:48:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:53:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 125.85172500007502,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.97557999994751,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:48:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:53:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.316413436361834,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.752838098566457,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:48:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:53:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.95898594036765,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.98620253284903,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:48:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:53:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6151.938715500023,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1971.939440999904,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:47:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 127.07350583665477,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 77.3307800733528,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:47:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.0308049999976,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.88041450025048,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:47:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.024544313732136,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.245637929339898,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:47:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.782349249063266,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.335751873328213,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:35 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:47:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68037.37022549921,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6017.480400000068,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:58:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:47:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30100.42021927734,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 126.19127569333918,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:58:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:47:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 28906.142724000347,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 85.02784850008993,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:58:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:47:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 190.25804743862463,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.38201789582426,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:58:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:47:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 201.1107669688994,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.42837691540775,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:58:06 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:47:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5146.531957000661,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 74014.74871899995,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:09:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:26:37 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.62724450000678,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54848.37464709335,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:09:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:26:37 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.349385500150674,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67050.76776550004,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:09:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:26:37 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.724407216546886,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.75695212437148,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:09:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:26:37 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.250674101384273,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.18998813923736,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:09:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:26:37 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6213.317461499855,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2056.14391600011,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:05:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:12:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 104.93692238665668,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.60492640665811,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:05:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:12:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.11752150006578,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.21059399999649,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:05:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:12:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.909500825725175,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.707244972260469,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:05:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:12:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.38810673453767,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.822381328736244,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:05:52 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:12:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6107.461079999894,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12958.427279500029,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:12:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:31:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 122.31564741000209,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 203.8325882873281,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:12:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:31:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.53915549999147,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 159.24496200022986,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:12:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:31:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.25123814329617,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 101.14719103000134,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:12:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:31:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.85368992077919,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 103.7680000369014,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:12:58 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:31:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1993.1376780000392,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12817.44234550024,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:25:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78.2969698333424,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 162.61203520401736,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:25:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42.129797999677976,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 131.56327699925896,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:25:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.455911070524213,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 114.50368467609007,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:25:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.530717232272059,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 102.96671823434735,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:25:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16751.494377999734,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1927.0397564996529,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:33 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:18:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 219.94728134001465,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 94.1372103999826,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:33 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:18:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 174.75347900017368,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.00312999963353,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:33 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:18:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 137.40456832042594,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.281667023213473,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:33 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:18:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 142.76655642728957,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.566237753947677,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:33 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:18:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17889.05048000015,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6131.616917499969,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:23:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:11:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1083.3993673066313,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 104.14621484665986,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:23:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:11:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 229.99817649906618,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.46932699986974,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:23:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:11:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 155.99140842961486,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.78245623232327,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:23:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:11:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.93596860500983,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.30715814795518,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:23:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:11:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2619.253929000024,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2544.320060500013,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:20:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:24:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.25586048800324,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 118.19201419733629,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:20:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:24:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.6595824998767,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.03659450004852,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:20:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:24:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.589928769144386,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.966716106169642,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:20:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:24:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.50490839269328,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.082941231904826,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:20:45 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:24:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80558.27551649987,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6380.9977025000535,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:21:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60218.67284408668,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 109.08876862667437,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:21:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 74254.7567985,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.93255199997839,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:21:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.85018591317444,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.34462272523681,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:21:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.08448521974842,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.037268217760975,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:21:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 63741.531855999936,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 56907.804171500175,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:56:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:00:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41159.210393221336,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23996.813235266003,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:56:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:00:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40212.179885499834,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 22443.69704499968,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:56:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:00:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 99.67944955550327,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 180.794710544754,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:56:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:00:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 102.0865461263572,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 192.16674524366823,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:56:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:00:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15127.325313500023,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11652.233324999543,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:02:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:06:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 210.2396104793285,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 188.99047598733705,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:02:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:06:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 168.13532199967085,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 147.68642900025952,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:02:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:06:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 122.39129914192421,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 88.104427365891,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:02:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:06:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 128.2692568067652,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 90.61193503831414,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:02:15 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:06:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1895.625746499718,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3667.2081065003113,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:41:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.22735414663475,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143.1880004566877,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:41:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.00738999960231,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 93.01893300016673,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:41:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.011212652119212,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.207180873330188,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:41:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.276344271528881,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 23.781346234008886,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:31 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:41:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1950.5479389999891,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5253.7163139995755,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:14:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:50:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 94.45890538333211,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 181.92108841200388,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:14:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:50:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.743625999999495,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 176.7236440000488,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:14:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:50:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.461409176814545,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.95742807844661,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:14:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:50:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.74895741758177,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.21478237179607,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:14:41 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:50:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6046.4773440007775,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 55046.67195750005,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:16:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:01:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 118.64030966269152,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34080.341097640005,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:16:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:01:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 88.55446850066073,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 33934.031472500006,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:16:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:01:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54.628013159921274,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.86181144956682,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:16:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:01:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 47.59649299788427,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 98.16332444601062,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:16:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:01:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3714.1871910007467,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 251508.07848850012,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:40:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:41:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.33834539669624,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 237415.37543954464,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:40:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:41:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 94.42250300071464,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 238630.57078349972,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:40:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:41:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.83992388186532,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.42414724961917,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:40:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:41:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.07884507100644,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.95421583107844,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:40:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:41:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2505.978134999623,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5744.0356564993635,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:18:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 114.93389008799689,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 115.82568461998986,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:18:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.6131885003415,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 86.97315400058869,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:18:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.713543749562035,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 52.39422652859815,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:18:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.84215263645413,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 45.508422680135055,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:18:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5536.584911499631,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6057.473102499898,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:48:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:18:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 182.84844596668216,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.94549031666763,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:48:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:18:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 174.18272499980958,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.69400949986994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:48:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:18:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.93757760724695,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.587095191553765,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:48:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:18:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.113197757742746,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.28696862664273,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:48:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:18:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6422.859499999959,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2406.3182155000504,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:35:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:59:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 111.53755632000184,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 113.72653467465352,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:35:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:59:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.94469050002772,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.61923700031548,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:35:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:59:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.634538236675354,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.890889228093677,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:35:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:59:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.28795376571324,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.174985396382663,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:35:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:59:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -99000,668 +99000,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-06-05T21:03:26Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/367c5ee80cc75f5d5b6af72de5e1e5e463e386f7"
         },
-        "date": 1717659337293,
+        "date": 1717659387678,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7561.334372500028,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7988.147077999997,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:53:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:47:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 155.53552857467307,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 158.11015766800696,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:53:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:47:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 131.37896199998522,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 125.92036299997744,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:53:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:47:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56.343515332182015,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.92950447171786,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:53:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:47:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55.79649200897705,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.69751428298563,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:53:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:47:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5124.215013000139,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2515.5910209996364,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:10:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.17597861989634,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 115.1154712280028,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:10:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.99882400010392,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.41899100044247,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:10:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.439815983475754,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.64229827587129,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:10:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30.997657132015405,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.854738917947568,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:10:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1853.9279535002606,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 259713.36875049997,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:53:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:36:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.09767266334832,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 244671.78918956334,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:53:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:36:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.97557999994751,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 246374.108513,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:53:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:36:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.752838098566457,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.70171740964336,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:53:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:36:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.98620253284903,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.33495090392691,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:53:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:36:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1971.939440999904,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6103.2862680000335,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:47:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 77.3307800733528,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 128.34661319999742,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:47:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.88041450025048,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.0437935000109,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:47:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.245637929339898,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.93754922583001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:47:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.335751873328213,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.85879893618455,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:47:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6017.480400000068,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3708.6690379992433,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:47:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:40:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 126.19127569333918,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 142.79501401664675,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:47:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:40:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 85.02784850008993,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 94.89751599994634,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:47:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:40:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.38201789582426,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.885508558223382,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:47:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:40:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.42837691540775,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.157567697125923,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:47:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:40:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 74014.74871899995,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5486.096011500194,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:26:37 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54848.37464709335,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 182.72797567327993,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:26:37 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67050.76776550004,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 173.86483949940157,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:26:37 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.75695212437148,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.35778788926874,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:26:37 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.18998813923736,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.40124741742852,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:26:37 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2056.14391600011,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17455.734022499655,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:12:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.60492640665811,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 224.04956231399652,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:12:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.21059399999649,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 179.17058600005475,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:12:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.707244972260469,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143.4412121768222,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:12:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.822381328736244,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 148.41204767122895,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:12:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12958.427279500029,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5158.379417000106,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:31:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:09:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 203.8325882873281,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.40565544002311,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:31:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:09:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 159.24496200022986,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 62.71433749952848,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:31:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:09:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 101.14719103000134,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.766268181055466,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:31:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:09:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 103.7680000369014,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.264735818489633,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:31:29 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:09:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12817.44234550024,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6169.043487500176,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:25:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:05:48 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 162.61203520401736,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 102.78861056664633,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:25:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:05:48 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 131.56327699925896,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.47407099990232,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:25:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:05:48 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 114.50368467609007,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.03913815392624,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:25:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:05:48 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 102.96671823434735,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.463414853667054,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:25:18 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:05:48 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1927.0397564996529,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6040.882092000174,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:18:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:16:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 94.1372103999826,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 118.98810139734755,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:18:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:16:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.00312999963353,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 87.44424650012661,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:18:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:16:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.281667023213473,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54.86818574100186,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:18:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:16:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.566237753947677,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 47.89272645072441,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:18:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:16:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6131.616917499969,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6421.205358500003,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:11:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:35:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 104.14621484665986,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 110.06596085332755,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:11:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:35:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.46932699986974,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.34748050005146,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:11:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:35:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.78245623232327,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.661673580311565,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:11:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:35:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.30715814795518,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.346143724572,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:11:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:35:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2544.320060500013,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79851.06389799989,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:24:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:21:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 118.19201419733629,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60057.25443040801,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:24:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:21:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.03659450004852,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 73214.05616899983,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:24:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:21:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.966716106169642,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.4950842808281,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:24:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:21:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.082941231904826,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.38619024632074,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:24:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:21:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6380.9977025000535,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1879.0043674998742,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 109.08876862667437,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.75263231003555,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.93255199997839,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.382567999819,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.34462272523681,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.956603668530867,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.037268217760975,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.210659433745182,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:12 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 56907.804171500175,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2067.1192844997677,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:00:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23996.813235266003,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.71093437332213,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:00:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 22443.69704499968,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42.59328949956398,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:00:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 180.794710544754,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.826639051328756,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:00:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 192.16674524366823,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.91504162451401,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:00:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11652.233324999543,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1995.768789000067,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:06:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 188.99047598733705,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78.88001226663012,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:06:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 147.68642900025952,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.83630199966137,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:06:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 88.104427365891,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.412102007079442,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:06:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 90.61193503831414,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.484164304969804,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:06:00 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3667.2081065003113,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6115.007779500047,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:41:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:12:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.1880004566877,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.25073971667614,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:41:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:12:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 93.01893300016673,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.17800850009371,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:41:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:12:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.207180873330188,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.21653402766049,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:41:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:12:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 23.781346234008886,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.858037121296086,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:41:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:12:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5253.7163139995755,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15728.803737999897,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:50:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:02:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 181.92108841200388,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 209.45778847200685,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:50:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:02:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 176.7236440000488,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 171.12318399995274,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:50:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:02:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.95742807844661,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 127.92452536060189,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:50:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:02:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.21478237179607,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 133.71982527368462,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:50:30 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:02:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 55046.67195750005,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2631.9194974998936,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:01:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:20:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34080.341097640005,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.30705779998243,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:01:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:20:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 33934.031472500006,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.10202550021495,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:01:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:20:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.86181144956682,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.851983777178372,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:01:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:20:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 98.16332444601062,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.862149778200866,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:01:55 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:20:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 251508.07848850012,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18910.703681000086,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:41:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:24:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 237415.37543954464,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1460.6714377166754,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:41:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:24:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 238630.57078349972,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 265.01184350036056,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:41:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:24:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.42414724961917,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.63675098098742,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:41:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:24:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65.95421583107844,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 145.87901538779113,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:41:07 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:24:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5744.0356564993635,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64955.77952849999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:18:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:56:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 115.82568461998986,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42256.55529801801,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:18:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:56:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 86.97315400058869,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41436.405920000085,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:18:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:56:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 52.39422652859815,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 100.85516992508515,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:18:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:56:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 45.508422680135055,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 103.39974855038686,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:18:24 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:56:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6057.473102499898,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1959.9752239996633,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:18:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:14:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.94549031666763,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 94.68746850665107,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:18:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:14:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.69400949986994,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60.128760000679904,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:18:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:14:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.587095191553765,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.532258797718644,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:18:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:14:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.28696862664273,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.824990888030028,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:18:25 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:14:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2406.3182155000504,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70450.58698949925,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:59:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:58:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 113.72653467465352,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31327.308545114716,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:59:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:58:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.61923700031548,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30185.243406000154,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:59:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:58:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.890889228093677,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 191.92121175681964,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:59:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:58:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.174985396382663,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 203.58036901354376,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.11.4 (main, Jun  7 2023, 10:57:56) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:59:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:58:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -99682,1350 +99682,1350 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-06-05T21:03:26Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/367c5ee80cc75f5d5b6af72de5e1e5e463e386f7"
         },
-        "date": 1717659387678,
+        "date": 1717659700478,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7988.147077999997,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5444.995677999941,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:47:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:54:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 158.11015766800696,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 182.43153263602773,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:47:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:54:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 125.92036299997744,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 173.46796700076084,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:47:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:54:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.92950447171786,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.280124068436784,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:47:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:54:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.69751428298563,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.37203757283457,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:47:56 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:54:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2515.5910209996364,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78778.1954510001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:28:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 115.1154712280028,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59162.487471686676,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:28:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.41899100044247,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 72037.40187199991,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:28:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.64229827587129,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.62343158062342,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:28:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.854738917947568,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.03294767678214,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:36 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:28:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 259713.36875049997,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1956.2861680001333,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:36:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:21:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 244671.78918956334,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.14779560665677,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:36:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:21:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 246374.108513,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.077937999518326,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:36:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:21:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.70171740964336,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.437431639448432,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:36:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:21:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.33495090392691,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.723519856814338,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:36:06 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:21:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6103.2862680000335,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65373.15701099999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:03:42 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 128.34661319999742,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42277.50111589267,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:03:42 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.0437935000109,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41742.67372500003,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:03:42 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.93754922583001,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 100.17157111138543,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:03:42 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.85879893618455,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 102.43454124109734,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:41:21 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:03:42 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3708.6690379992433,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7956.53410649993,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:40:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:55:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 142.79501401664675,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.04044950666443,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:40:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:55:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 94.89751599994634,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 131.30081100007374,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:40:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:55:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.885508558223382,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.715331306465714,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:40:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:55:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.157567697125923,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.409235365153606,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:40:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:55:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5486.096011500194,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70516.79164049984,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:04:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 182.72797567327993,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31495.424776235326,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:04:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 173.86483949940157,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30031.702047999715,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:04:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.35778788926874,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 193.22762823587541,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:04:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.40124741742852,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 204.57441203650677,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:48:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:04:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17455.734022499655,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 259011.95400250025,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:27 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 224.04956231399652,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 244055.78354314202,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:27 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 179.17058600005475,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 245917.08374050018,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:27 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.4412121768222,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.01394674477102,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:27 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 148.41204767122895,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.20731673968393,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:50 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:27 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5158.379417000106,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2064.670552500047,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:09:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:15:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.40565544002311,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.15265723332655,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:09:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:15:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 62.71433749952848,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42.677035499764315,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:09:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:15:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.766268181055466,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.785276597867119,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:09:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:15:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.264735818489633,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.899947824989589,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:09:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:15:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6169.043487500176,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18063.48664699999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:05:48 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:34:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 102.78861056664633,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 232.45559611600157,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:05:48 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:34:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.47407099990232,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 182.47827300001518,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:05:48 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:34:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.03913815392624,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 149.0605816427107,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:05:48 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:34:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.463414853667054,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 153.05545275452593,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:05:48 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:34:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6040.882092000174,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3704.064616999858,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:16:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:47:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 118.98810139734755,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143.43571817667907,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:16:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:47:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 87.44424650012661,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 97.69973150014266,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:16:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:47:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54.86818574100186,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.87243340627162,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:16:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:47:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 47.89272645072441,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.099637962123776,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:16:41 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:47:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6421.205358500003,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6106.5911880000385,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:35:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:20:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 110.06596085332755,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 122.00355583999226,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:35:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:20:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.34748050005146,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.82650199996533,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:35:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:20:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.661673580311565,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.20168109883727,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:35:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:20:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.346143724572,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.8089047256667,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:35:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:20:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79851.06389799989,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6035.800041500806,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:21:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:22:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60057.25443040801,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.20267250667773,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:21:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:22:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 73214.05616899983,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 87.79566750035883,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:21:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:22:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.4950842808281,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54.44995107384843,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:21:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:22:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.38619024632074,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 47.61603700214803,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:21:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:22:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1879.0043674998742,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1889.1761065005994,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 91.75263231003555,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.37879568333179,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.382567999819,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 57.173135000084585,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.956603668530867,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.024526196932513,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.210659433745182,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.257026714855618,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2067.1192844997677,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5162.35110100024,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:15:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.71093437332213,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 88.73062119988997,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:15:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42.59328949956398,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61.62037499962025,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:15:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.826639051328756,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.810021177324366,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:15:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.91504162451401,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.314041988770864,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:52 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:15:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1995.768789000067,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15547.739225999976,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78.88001226663012,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 209.84569314132264,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.83630199966137,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 169.8408844995356,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.412102007079442,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 125.59957043409153,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.484164304969804,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 131.1312142104596,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:33 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6115.007779500047,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6116.3640780000605,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:12:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:48:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.25073971667614,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 129.47920948666857,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:12:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:48:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.17800850009371,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.83823349994691,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:12:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:48:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.21653402766049,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.796792863182176,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:12:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:48:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.858037121296086,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.748115254378284,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:12:55 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:48:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15728.803737999897,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2619.209377000061,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:02:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 209.45778847200685,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.57118906267958,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:02:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 171.12318399995274,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.23856800006979,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:02:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 127.92452536060189,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.589286586639965,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:02:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 133.71982527368462,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.653235870041396,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:02:22 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2631.9194974998936,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2491.7438265001692,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:20:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:01:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.30705779998243,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 114.97949669731922,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:20:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:01:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.10202550021495,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.77798499996425,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:20:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:01:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.851983777178372,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.55062783805633,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:20:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:01:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.862149778200866,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.703622409928943,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:20:58 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:01:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18910.703681000086,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6432.053577000033,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:24:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:42:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1460.6714377166754,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 111.57128104666981,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:24:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:42:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 265.01184350036056,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 71.33699750005462,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:24:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:42:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.63675098098742,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.59753883390603,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:24:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:42:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 145.87901538779113,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.31719864656108,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:24:39 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:42:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64955.77952849999,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1985.349613499693,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:56:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42256.55529801801,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78.35227234002257,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:56:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41436.405920000085,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.155220500106225,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:56:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 100.85516992508515,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.33685106604449,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:56:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 103.39974855038686,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.407172921438526,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:56:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1959.9752239996633,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6185.281503999931,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:14:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:13:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 94.68746850665107,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 106.85994023998623,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:14:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:13:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60.128760000679904,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.0730799997491,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:14:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:13:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.532258797718644,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.97793281004091,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:14:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:13:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.824990888030028,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.46808400963367,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:14:53 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:13:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70450.58698949925,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18247.279312500723,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:58:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:30:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31327.308545114716,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1292.66038330267,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:58:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:30:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30185.243406000154,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 246.61586699949112,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:58:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:30:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 191.92121175681964,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 156.7642815834863,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:58:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:30:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 203.58036901354376,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 144.93258284940498,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:58:32 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:30:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
       {
         "commit": {
           "author": {
-            "name": "dhuangnm",
-            "username": "dhuangnm",
-            "email": "74931910+dhuangnm@users.noreply.github.com"
+            "name": "Derek Kozikowski",
+            "username": "derekk-nm",
+            "email": "106621615+derekk-nm@users.noreply.github.com"
           },
           "committer": {
             "name": "GitHub",
             "username": "web-flow",
             "email": "noreply@github.com"
           },
-          "id": "367c5ee80cc75f5d5b6af72de5e1e5e463e386f7",
-          "message": "strip binaries (#283)\n\nCo-authored-by: dhuangnm <dhuang@MacBook-Pro-2.local>",
-          "timestamp": "2024-06-05T21:03:26Z",
-          "url": "https://github.com/neuralmagic/nm-vllm/commit/367c5ee80cc75f5d5b6af72de5e1e5e463e386f7"
+          "id": "87571b8be8105738d6da87df053d5a32e7fa001e",
+          "message": "add more models, new num_logprobs (#285)\n\nadding the `microsoft/phi-2`, `google/gemma-1.1-2b-it`, and\r\n`HuggingFaceH4/zephyr-7b-gemma-v0.1` models to\r\ntest_basic_server_correctness.py. this required increasing the number of\r\nlogprobs included in the evaluation to avoid unexpected failure for a\r\nfew prompts with these models. this did not negatively impact the other\r\nmodels.\r\n\r\nran the test locally multiple times.  each time we passed, like this:\r\n```\r\n/root/pyvenv/nmv3119a/bin/python3 /root/.local/share/JetBrains/IntelliJIdea2023.3/python/helpers/pycharm/_jb_pytest_runner.py --target test_basic_server_correctness.py::test_models_on_server -- --forked \r\nTesting started at 2:24 PM ...\r\nLaunching pytest with arguments --forked test_basic_server_correctness.py::test_models_on_server --no-header --no-summary -q in /network/derekk/testdev1/nm-vllm/tests/basic_correctness\r\n\r\n============================= test session starts ==============================\r\ncollecting ... collected 7 items\r\nRunning 7 items in this shard: tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-mistralai/Mistral-7B-Instruct-v0.2-4096-None-None], tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50-4096-sparse_w16a16-None], tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-NousResearch/Llama-2-7b-chat-hf-4096-None-None], tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-neuralmagic/Llama-2-7b-pruned70-retrained-ultrachat-4096-sparse_w16a16-None], tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-microsoft/phi-2-2048-None-None], tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-google/gemma-1.1-2b-it-2056-None-None], tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-HuggingFaceH4/zephyr-7b-gemma-v0.1-4096-None-None]\r\n\r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-mistralai/Mistral-7B-Instruct-v0.2-4096-None-None] \r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50-4096-sparse_w16a16-None] \r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-NousResearch/Llama-2-7b-chat-hf-4096-None-None] \r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-neuralmagic/Llama-2-7b-pruned70-retrained-ultrachat-4096-sparse_w16a16-None] \r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-microsoft/phi-2-2048-None-None] \r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-google/gemma-1.1-2b-it-2056-None-None] \r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-HuggingFaceH4/zephyr-7b-gemma-v0.1-4096-None-None] \r\n\r\n======================== 7 passed in 1332.51s (0:22:12) ========================\r\n```",
+          "timestamp": "2024-06-06T20:15:52Z",
+          "url": "https://github.com/neuralmagic/nm-vllm/commit/87571b8be8105738d6da87df053d5a32e7fa001e"
         },
-        "date": 1717659700478,
+        "date": 1717745616435,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5444.995677999941,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 14699.765167500118,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:54:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:00:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 182.43153263602773,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 211.30380943466784,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:54:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:00:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 173.46796700076084,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 171.09230100004424,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:54:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:00:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.280124068436784,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 118.17419190265723,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:54:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:00:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.37203757283457,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 123.72379252928387,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:54:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:00:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78778.1954510001,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6101.630609999802,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:28:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:12:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59162.487471686676,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.55697118334956,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:28:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:12:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 72037.40187199991,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.46602499985056,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:28:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:12:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.62343158062342,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.0817014139771,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:28:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:12:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.03294767678214,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.7675226272593,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:28:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:12:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1956.2861680001333,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2071.710462000283,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:21:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:07:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.14779560665677,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.17708044670144,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:21:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:07:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.077937999518326,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.36620849981409,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:21:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:07:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.437431639448432,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.851890254539338,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:21:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:07:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.723519856814338,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.968660420963158,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:21:06 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:07:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65373.15701099999,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7952.279740999984,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:03:42 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:47:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42277.50111589267,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.69975958266573,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:03:42 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:47:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41742.67372500003,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 130.6232330000512,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:03:42 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:47:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 100.17157111138543,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.555887026381384,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:03:42 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:47:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 102.43454124109734,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.24977585960107,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:03:42 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:47:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7956.53410649993,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1989.4785374999628,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:55:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:42:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.04044950666443,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78.30192025334934,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:55:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:42:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 131.30081100007374,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.08235500057344,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:55:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:42:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.715331306465714,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.371696815007207,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:55:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:42:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.409235365153606,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.450931988206621,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:55:23 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:42:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70516.79164049984,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 63511.5111409998,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:04:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:56:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31495.424776235326,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41315.98948766067,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:04:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:56:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30031.702047999715,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40922.82402299998,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:04:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:56:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 193.22762823587541,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 100.2348996305846,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:04:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:56:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 204.57441203650677,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 102.47305210424202,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:04:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:56:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 259011.95400250025,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16881.950900999982,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:27 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:26:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 244055.78354314202,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 227.5808386700167,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:27 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:26:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 245917.08374050018,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 180.28589000005013,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:27 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:26:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.01394674477102,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 137.69965027748881,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:27 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:26:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.20731673968393,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 144.0477701306793,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:43:27 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:26:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2064.670552500047,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68539.91656400103,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:15:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:56:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.15265723332655,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30583.081945974624,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:15:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:56:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42.677035499764315,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 29791.70794899983,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:15:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:56:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.785276597867119,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 191.3069806478081,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:15:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:56:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.899947824989589,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 202.39804439303757,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:15:05 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:56:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18063.48664699999,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6406.818620500018,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:34:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:35:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 232.45559611600157,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 110.17746072666644,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:34:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:35:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 182.47827300001518,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.06669449998526,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:34:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:35:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 149.0605816427107,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.55905916254394,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:34:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:35:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 153.05545275452593,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.23628466129036,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:34:04 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:35:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3704.064616999858,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79631.96407450005,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:47:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:21:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.43571817667907,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59710.224629344004,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:47:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:21:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 97.69973150014266,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 73366.30043800005,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:47:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:21:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.87243340627162,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.95557728360805,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:47:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:21:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.099637962123776,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.4342788293216,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 04:47:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:21:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6106.5911880000385,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2621.152668999912,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:20:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:19:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 122.00355583999226,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.64845341196997,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:20:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:19:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.82650199996533,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 84.57594350011277,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:20:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:19:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.20168109883727,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.723464604925574,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:20:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:19:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.8089047256667,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.694372836787924,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:20:20 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:19:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6035.800041500806,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 258293.03068149966,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:22:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:35:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.20267250667773,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 243293.79929580932,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:22:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:35:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 87.79566750035883,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 245236.71960799993,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:22:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:35:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54.44995107384843,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.82026946569562,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:22:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:35:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 47.61603700214803,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.97497076864781,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:22:57 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:35:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1889.1761065005994,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3688.6902534997716,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:39:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.37879568333179,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143.05877963339904,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:39:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 57.173135000084585,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.61334450093273,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:39:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.024526196932513,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.643208871601594,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:39:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.257026714855618,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.011863460333114,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:55:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:39:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5162.35110100024,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1972.461887499776,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:15:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:13:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 88.73062119988997,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 95.00300123664299,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:15:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:13:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 61.62037499962025,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61.475705999782804,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:15:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:13:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.810021177324366,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.541939674034946,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:15:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:13:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.314041988770864,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.823523306634206,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:15:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:13:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15547.739225999976,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2517.0589059998747,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:54:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 209.84569314132264,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 115.37199964132863,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:54:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 169.8408844995356,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 81.5983624997898,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:54:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 125.59957043409153,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.680460189964677,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:54:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 131.1312142104596,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.916389410917894,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:08:37 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:54:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6116.3640780000605,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5158.889067000018,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:48:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:07:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 129.47920948666857,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 88.69670423333446,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:48:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:07:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.83823349994691,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 62.149140499059286,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:48:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:07:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.796792863182176,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.788614128731986,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:48:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:07:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.748115254378284,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.294957475099334,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:48:49 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:07:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2619.209377000061,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5423.314423500415,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:47:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.57118906267958,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 183.49934367465175,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:47:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 83.23856800006979,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 172.76279899942892,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:47:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.589286586639965,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.98082605136603,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:47:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.653235870041396,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.07007129864882,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:27:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:47:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2491.7438265001692,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6169.823153999914,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:01:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:05:35 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 114.97949669731922,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 105.81503897332975,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:01:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:05:35 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79.77798499996425,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.29999150004551,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:01:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:05:35 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.55062783805633,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 34.925416817931676,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:01:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:05:35 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.703622409928943,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.43782526824139,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 04:01:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:05:35 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6432.053577000033,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17779.014052000093,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:42:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:22:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 111.57128104666981,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 964.8561994160129,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:42:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:22:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 71.33699750005462,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 216.97776799919666,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:42:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:22:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.59753883390603,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 155.16468318669814,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:42:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:22:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.31719864656108,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 142.75762987123997,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 02:42:36 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:22:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1985.349613499693,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6098.303635500088,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:41:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78.35227234002257,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 127.7963707633406,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:41:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.155220500106225,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 87.85856650001733,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:41:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.33685106604449,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.72981414162362,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:41:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.407172921438526,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.67089046480836,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:49:52 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:41:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6185.281503999931,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6013.731897999605,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:13:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:14:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 106.85994023998623,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.57696941996983,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:13:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:14:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.0730799997491,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 88.63940199898934,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:13:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:14:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.97793281004091,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54.343427984564606,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:13:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:14:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.46808400963367,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 47.349752413306696,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-06 03:13:12 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:14:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18247.279312500723,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1882.2477829999116,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:30:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:48:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1292.66038330267,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.09608332000546,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:30:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:48:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 246.61586699949112,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60.545114999968064,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:30:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:48:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 156.7642815834863,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.970106767876448,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:30:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:48:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 144.93258284940498,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.26096215724809,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-06 05:30:38 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:48:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -101046,668 +101046,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-06-06T20:15:52Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/87571b8be8105738d6da87df053d5a32e7fa001e"
         },
-        "date": 1717745616435,
+        "date": 1717746402374,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 14699.765167500118,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2517.7662810001493,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:00:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:06:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 211.30380943466784,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 115.19161227997877,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:00:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:06:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 171.09230100004424,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.63697599982333,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:00:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:06:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 118.17419190265723,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.721679456002857,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:00:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:06:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 123.72379252928387,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.92413555893253,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:00:55 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:06:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6101.630609999802,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5158.582335999199,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:12:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:21:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.55697118334956,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 90.95263102667862,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:12:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:21:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.46602499985056,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 64.343397999437,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:12:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:21:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.0817014139771,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.75590584696172,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:12:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:21:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.7675226272593,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.293732228919737,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:12:42 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:21:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2071.710462000283,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 259032.36060049993,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:07:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:48:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.17708044670144,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 244612.28835521068,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:07:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:48:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.36620849981409,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 246070.47978950004,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:07:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:48:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.851890254539338,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.27743964452591,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:07:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:48:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.968660420963158,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.33746257589912,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:07:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:48:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7952.279740999984,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7973.015396499932,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:47:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:00:05 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.69975958266573,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.49508787200284,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:47:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:00:05 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 130.6232330000512,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 132.383733499978,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:47:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:00:05 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.555887026381384,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.95818823991842,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:47:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:00:05 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.24977585960107,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.70170960125752,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:47:51 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:00:05 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1989.4785374999628,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3715.0806455001657,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:42:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:52:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78.30192025334934,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 143.27519131335671,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:42:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:52:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.08235500057344,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 90.99995199994737,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:42:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:52:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.371696815007207,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 25.068206341116543,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:42:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:52:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.450931988206621,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.23450235230215,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:42:12 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:52:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 63511.5111409998,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78549.1150764999,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:56:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:33:28 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41315.98948766067,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58930.692567949336,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:56:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:33:28 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40922.82402299998,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 71195.70304599984,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:56:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:33:28 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 100.2348996305846,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.66825277654573,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:56:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:33:28 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 102.47305210424202,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.38126489244029,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:56:08 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:33:28 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16881.950900999982,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2075.8854539999447,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:26:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:19:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
-          {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 227.5808386700167,
+          {
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.4134203466371,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:26:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:19:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 180.28589000005013,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.12088950068937,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:26:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:19:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 137.69965027748881,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.946538959099508,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:26:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:19:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 144.0477701306793,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.962208010001273,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:26:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:19:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68539.91656400103,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6127.476792999914,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:56:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:25:03 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30583.081945974624,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.78522272334249,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:56:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:25:03 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 29791.70794899983,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78.03906449976239,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:56:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:25:03 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 191.3069806478081,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.32860101464848,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:56:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:25:03 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 202.39804439303757,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.88441506334213,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:56:48 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:25:03 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6406.818620500018,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5473.464693999631,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:35:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:00:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 110.17746072666644,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 182.49806569066035,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:35:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:00:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.06669449998526,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 172.2819434990015,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:35:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:00:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.55905916254394,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.36208355074749,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:35:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:00:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.23628466129036,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.49202614536779,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:35:04 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:00:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 79631.96407450005,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6120.321745000069,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:21:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:53:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59710.224629344004,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 128.82724009000412,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:21:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:53:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 73366.30043800005,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.28278450002836,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:21:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:53:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.95557728360805,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.885607378811606,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:21:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:53:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.4342788293216,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.90211287091744,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:21:05 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:53:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2621.152668999912,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17906.05553700061,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:19:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:36:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.64845341196997,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1134.9781032719814,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:19:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:36:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 84.57594350011277,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 242.94661450039712,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:19:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:36:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.723464604925574,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 156.13205289057584,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:19:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:36:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.694372836787924,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 144.35530311353594,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:19:24 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:36:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 258293.03068149966,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6063.1630145007875,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:35:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:28:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 243293.79929580932,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 119.01546864803822,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:35:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:28:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 245236.71960799993,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.85463899989554,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:35:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:28:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69.82026946569562,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54.88936309995303,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:35:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:28:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.97497076864781,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 47.83838556979124,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:35:47 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:28:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3688.6902534997716,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6211.771980999856,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:39:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:17:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.05877963339904,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 104.04621746668151,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:39:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:17:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.61334450093273,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.50385799993819,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:39:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:17:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.643208871601594,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.00779937451883,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:39:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:17:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.011863460333114,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.450362258075806,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:39:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:17:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1972.461887499776,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17628.689927000323,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:13:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:39:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 95.00300123664299,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 230.70867368067593,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:13:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:39:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 61.475705999782804,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 188.1116634999671,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:13:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:39:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.541939674034946,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 145.20456577223268,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:13:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:39:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.823523306634206,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 149.8547744586407,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:13:21 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:39:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2517.0589059998747,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1895.8214969998153,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:54:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:00:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 115.37199964132863,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 92.05938659671423,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:54:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:00:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 81.5983624997898,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60.38087849992735,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:54:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:00:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.680460189964677,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.053351842604936,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:54:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:00:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.916389410917894,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.341031605919355,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:54:14 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:00:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5158.889067000018,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6432.436080499997,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:07:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:47:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 88.69670423333446,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 111.54317384666001,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:07:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:47:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 62.149140499059286,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.22862650001116,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:07:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:47:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.788614128731986,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.64588538272693,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:07:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:47:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.294957475099334,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.32310866398179,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:07:53 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:47:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5423.314423500415,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15676.351509999677,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:47:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:13:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 183.49934367465175,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 214.77793233064767,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:47:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:13:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 172.76279899942892,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 173.0857054999433,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:47:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:13:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.98082605136603,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 126.57796654302709,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:47:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:13:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.07007129864882,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 131.56042148320319,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:47:02 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:13:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6169.823153999914,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1975.4753754996273,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:05:35 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:26:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 105.81503897332975,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 94.6554390200193,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:05:35 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:26:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.29999150004551,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.03294850031671,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:05:35 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:26:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 34.925416817931676,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.590512873251088,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:05:35 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:26:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.43782526824139,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.86248584167194,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:05:35 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:26:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17779.014052000093,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65101.35466450004,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:22:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:08:26 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 964.8561994160129,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42475.126850674664,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:22:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:08:26 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 216.97776799919666,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41829.31294600007,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:22:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:08:26 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 155.16468318669814,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 101.03935240576146,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:22:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:08:26 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 142.75762987123997,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 103.5619433051051,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:22:51 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:08:26 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6098.303635500088,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2632.2685649997766,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:41:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:32:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 127.7963707633406,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.501228742641,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:41:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:32:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 87.85856650001733,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 85.52717499969731,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:41:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:32:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.72981414162362,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.762526092246535,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:41:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:32:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.67089046480836,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.859516402474487,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:41:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:32:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6013.731897999605,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69213.93870249995,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:14:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:10:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.57696941996983,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30924.398834644016,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:14:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:10:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 88.63940199898934,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 29796.208432499952,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:14:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:10:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54.343427984564606,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 191.66613274661282,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:14:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:10:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 47.349752413306696,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 202.69734388710387,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:14:54 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:10:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1882.2477829999116,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1998.5766394997881,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:48:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:54:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.09608332000546,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 78.00553531333813,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:48:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:54:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60.545114999968064,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.35294949992385,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:48:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:54:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.970106767876448,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.49801165393879,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:48:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:54:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.26096215724809,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.525099976594081,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:38:31) [GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:48:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:54:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       },
@@ -101728,668 +101728,668 @@ window.BENCHMARK_DATA = {
           "timestamp": "2024-06-06T20:15:52Z",
           "url": "https://github.com/neuralmagic/nm-vllm/commit/87571b8be8105738d6da87df053d5a32e7fa001e"
         },
-        "date": 1717746402374,
+        "date": 1717746526012,
         "tool": "customSmallerIsBetter",
         "benches": [
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2517.7662810001493,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6421.003128999984,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:06:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:49:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 115.19161227997877,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 111.42623192666709,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:06:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:49:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.63697599982333,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 68.67188750004516,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:06:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:49:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 18.721679456002857,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.67433006480225,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:06:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:49:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 16.92413555893253,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 37.347861589580766,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:06:44 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:49:38 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5158.582335999199,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70343.46215499955,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:21:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:12:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 90.95263102667862,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31386.580312674017,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:21:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:12:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 64.343397999437,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 30286.047331999725,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:21:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:12:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.75590584696172,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 192.48230756539462,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:21:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:12:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 31.293732228919737,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 204.17271244945383,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:21:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:12:01 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 259032.36060049993,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 7974.283517999993,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:48:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:02:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 244612.28835521068,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.71680846665973,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:48:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:02:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 246070.47978950004,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 132.23167049989115,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:48:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:02:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.27743964452591,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59.148484691931976,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:48:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:02:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 67.33746257589912,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.772616362214926,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:48:15 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:02:24 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 7973.015396499932,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1878.2174769999074,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:00:05 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:02:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 157.49508787200284,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.77253980332655,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:00:05 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:02:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 132.383733499978,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 60.431901500123786,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:00:05 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:02:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.95818823991842,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.972919783518263,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:00:05 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:02:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58.70170960125752,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.254923318192837,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:00:05 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:02:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 3715.0806455001657,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 15800.989984999433,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:52:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:15:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 143.27519131335671,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 212.69218189532452,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:52:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:15:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 90.99995199994737,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 171.57445400016513,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:52:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:15:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 25.068206341116543,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 127.97760569107744,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:52:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:15:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 24.23450235230215,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 133.28244030676518,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:52:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:15:43 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78549.1150764999,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6190.521204500101,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:33:28 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:20:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 58930.692567949336,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 104.20235481998739,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:33:28 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:20:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 71195.70304599984,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65.80370149981718,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:33:28 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:20:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 68.66825277654573,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.01656560674975,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:33:28 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:20:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 66.38126489244029,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 35.58056303142888,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:33:28 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:20:11 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2075.8854539999447,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 260384.93473999962,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:19:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:50:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 80.4134203466371,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 245451.56963145733,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:19:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:50:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.12088950068937,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 246618.5867190002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:19:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:50:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.946538959099508,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 70.43640871225261,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:19:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:50:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.962208010001273,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 67.46840009518934,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:19:56 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:50:30 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6127.476792999914,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 65326.86430050012,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:25:03 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:10:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 121.78522272334249,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42519.619072352,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:25:03 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:10:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78.03906449976239,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 42629.212142499906,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:25:03 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:10:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.32860101464848,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 100.75692873844417,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:25:03 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:10:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 39.88441506334213,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 102.79274229096899,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:25:03 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:10:44 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 5473.464693999631,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5447.402925999995,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:00:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:02:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 182.49806569066035,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 182.44597291333534,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:00:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:02:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 172.2819434990015,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 172.4453029992219,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:00:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:02:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.36208355074749,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.31002367006393,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:00:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:02:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 38.49202614536779,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 38.53592806996797,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:00:16 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:02:13 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6120.321745000069,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2636.8615165001756,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:53:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:34:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 128.82724009000412,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.61561626666784,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:53:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:34:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.28278450002836,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 83.31963150021693,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:53:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:34:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.885607378811606,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 19.77806716244339,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:53:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:34:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 40.90211287091744,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17.863477117162816,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:53:30 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:34:46 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17906.05553700061,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1964.9785370002064,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:36:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:28:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1134.9781032719814,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 94.92590485000619,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:36:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:28:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 242.94661450039712,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 61.846054999932676,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:36:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:28:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 156.13205289057584,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 13.529513843147392,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:36:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:28:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 144.35530311353594,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 12.801651558912493,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:36:09 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:28:26 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6063.1630145007875,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 17789.619821000088,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:28:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:41:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 119.01546864803822,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 223.7930222799987,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:28:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:41:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 89.85463899989554,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 178.4341494999353,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:28:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:41:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 54.88936309995303,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 147.45370180062469,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:28:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:41:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 47.83838556979124,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 152.08704081000582,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:28:11 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:41:39 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6211.771980999856,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 3706.348058501135,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:17:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:54:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 104.04621746668151,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 144.10143136341503,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:17:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:54:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.50385799993819,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 91.89893900111201,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:17:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:54:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.00779937451883,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.774852731533024,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:17:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:54:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 35.450362258075806,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 24.09421720982175,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:17:56 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 04:54:26 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17628.689927000323,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6124.95311549992,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:39:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:55:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 230.70867368067593,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 127.61545218332762,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:39:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:55:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 188.1116634999671,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.0930150000122,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:39:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:55:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 145.20456577223268,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.003689081342976,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:39:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:55:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 149.8547744586407,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.82435296499245,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:39:25 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:55:50 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1895.8214969998153,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2077.677904000211,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:00:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:22:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 92.05938659671423,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.92628254669155,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:00:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:22:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 60.38087849992735,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 41.31876200017359,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:00:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:22:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.053351842604936,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.923252010410245,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:00:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:22:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.341031605919355,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.962807159673819,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:00:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:22:10 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 6432.436080499997,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6066.111044500758,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:47:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:30:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 111.54317384666001,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 120.11073751601604,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:47:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:30:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 70.22862650001116,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.60528650004562,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:47:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:30:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 36.64588538272693,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 54.64651420804996,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:47:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:30:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 37.32310866398179,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 47.674594944977116,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 02:47:17 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:30:07 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 15676.351509999677,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 6116.888871499896,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:13:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:27:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 214.77793233064767,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 121.58046511666726,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:13:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:27:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 173.0857054999433,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.99082649982847,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:13:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:27:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 126.57796654302709,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 40.2163011519034,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:13:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:27:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 131.56042148320319,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.91395159878737,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:13:29 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:27:17 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1975.4753754996273,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79834.82763249982,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:26:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:35:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 94.6554390200193,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 59873.52952924534,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:26:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:35:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 59.03294850031671,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 73565.9825685002,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:26:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:35:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 13.590512873251088,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 69.03438943049908,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:26:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:35:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 12.86248584167194,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 66.48010893843285,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:26:13 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:35:41 UTC\",\n  \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 65101.35466450004,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 2537.462056500317,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:08:26 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:08:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 42475.126850674664,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 115.79581256398403,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:08:26 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:08:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41829.31294600007,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 80.16767150002124,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:08:26 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:08:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 101.03935240576146,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18.801487508640506,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:08:26 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:08:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 103.5619433051051,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 16.960467193772498,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:08:26 UTC\",\n  \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:08:59 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 2632.2685649997766,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 18870.926144501027,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:32:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:37:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 120.501228742641,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1503.0183703219845,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:32:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:37:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 85.52717499969731,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 265.5922145004297,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:32:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:37:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 19.762526092246535,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 157.79824274366626,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:32:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:37:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 17.859516402474487,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 146.09656217046984,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      750,\n      \"2.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 04:32:32 UTC\",\n  \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:37:49 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 69213.93870249995,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 5155.054059499889,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:10:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:23:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 30924.398834644016,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 89.98122839994419,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:10:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:23:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 29796.208432499952,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 58.98962949959241,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:10:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:23:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 191.66613274661282,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 36.717080017877386,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:10:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:23:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 202.69734388710387,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 31.255065943148917,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      1500,\n      \"5.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:10:03 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n  },\n  \"date\": \"2024-06-07 05:23:05 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 1998.5766394997881,
+            "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 1996.6575409998768,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:54:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:56:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 78.00553531333813,
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 79.3403228799798,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:54:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:56:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 41.35294949992385,
+            "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 39.85432399986166,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:54:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:56:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.49801165393879,
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.385212283854328,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:54:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:56:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           },
           {
-            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
-            "value": 11.525099976594081,
+            "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}",
+            "value": 11.468586433802306,
             "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.8.17 (default, Jun  7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:54:40 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.0\",\n    \"python_version\": \"3.9.17 (main, Jun  7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n    \"torch_version\": \"2.3.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A10G\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A10G x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"localhost\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      150,\n      \"0.5\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-06-07 03:56:56 UTC\",\n  \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
       }